博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
2014-05-02
阅读量:4674 次
发布时间:2019-06-09

本文共 4122 字,大约阅读时间需要 13 分钟。

#!/usr/bin/perl

use utf8;

use Data::Dumper qw(Dumper);
use HTML::Element;
use HTML::TreeBuilder;

# Encode everything printed to STDOUT as UTF-8. STDIN and STDERR are left
# alone; the disabled lines show how they could be switched as well.
#binmode(STDIN,':encoding(utf8)');
binmode STDOUT, ':encoding(UTF-8)';
#binmode(STDERR,':encoding(utf8)');

# Data::Dumper style 1: newlines plus a fixed indent per nesting level.
$Data::Dumper::Indent = 1;

#foreach my $file_name (@ARGV){

# The input is a saved HTML page expected in the current directory.
my $file_name = "huxiu-webDetail";

# Stop immediately when the input is missing: carrying on would only parse
# an unopened handle and produce an empty tree.
-e $file_name
    or die "$file_name does not exist\n";

    # Three-argument open with an explicit decoding layer, checked for
    # failure. The handle keeps the name DATA because the script closes it
    # under that name once the tree has been dumped.
    open(DATA, '<:encoding(UTF-8)', $file_name)
        or die "cannot open $file_name: $!\n";

    # Build the element tree from the decoded input.
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file(\*DATA);
   
# Lookups tried earlier and kept for reference:
#    $title = $tree->find_by_tag_name('title');
#    @desc = $tree->find_by_tag_name('description');
#    @link = $tree->find_by_tag_name('link');
#    @image = $tree->find_by_tag_name('image');

    # Locate the two top-level sections of the parsed document.
    my $head = $tree->find_by_tag_name("head");
    my $body = $tree->find_by_tag_name("body");

    # Everything below dereferences $head, so fail loudly if it is missing.
    defined $head
        or die "no <head> element found\n";

    # Look directly at the hash behind the element object:
    #   _parent  - reference to the enclosing element's hash
    #   _content - reference to the array of child nodes
    #   _tag     - the element's tag name
    my $var_par = $head->{'_parent'};
    my $var_con = $head->{'_content'};
    my $var_tag = $head->{'_tag'};

    # Key listings; uncomment the print lines to see them.
    # $head is a reference, so it has to be dereferenced here: a plain
    # %head would be a separate hash that is always empty.
    foreach my $key (keys %$head) {
#        print $key,"\n";
    }

    foreach my $key (keys %$var_par) {
#        print $key,"\n";
    }

    while (my ($key, $value) = each %$var_par) {
#        print "$key=>$value\n";
    }
# ----------------------------------------------------------------------
# Walk the direct children of <head> and visit every key of each child.
# The per-key print is switched off, so only the two separator lines and
# the three summary values below reach the output.
# ----------------------------------------------------------------------

    print "=========================================\n";
    my $last_index = scalar(@$var_con) - 1;
    foreach my $idx (0 .. $last_index) {
        my $child = $var_con->[$idx];
        for my $name (keys %$child) {
#            print $name,"\n";
        }
    }
    print "========================================\n";

    # Summary: the parent's child list, <head>'s own child list, and
    # <head>'s tag name. The first two print as reference strings.
    print "$var_par->{'_content'}\n";

    print "$var_con\n";
    print "$var_tag\n";
    sub printcontent{
        my $vax = @_->[0];
        my $tag = @_->[1];
        my $icount = @$vax-1;
#        print $icount+1,"\n";
#        print  $vax->[0],"\n";
        for my $i(0 .. $icount){
#            print  $i,$vax->[$i],"\n";
#            print $i,$vax->[$i]{'_tag'},"\n";
#            if( @$vax->[$i]{'_content'}!=())
#            {
#                print $i,":";
#                printcontent ($vax->[$i]{'_content'});
#            }
#            elsif($vax->[$i]{'content'}!=undef)
#            {
#                print $i,":";
#                printcontent ($vax[$i]{'content'});
#            }
#            else
#            {
                my $hash = $vax->[$i];
                foreach my $key(keys %$hash)
                {
                    if($key ne "_parent"){
                        print $i,":",$key,"=";
                        print $vax->[$i]{$key},"\n";
                    }
                    elsif($key ==  '_content')
                    {
#                        Dumper $key,"\n";
                        if(@$vax->[$i]{'_content'}[0]{'_content'}!=()){
#                            print $i,":_content=",$vax->[$i]{'_content'}[0],"\n";
                            printcontent($vax->[$i]{'_content'});
                        }
                        else{
                            print $i,":_content============",$vax->[$i]{'_content'}[0],"\n";
                        }
                    }
                }
#            }
        }
    }
# Dump the document starting from the child list of <head>'s parent, then
# finish the output with a blank line.
    my $top_level = $var_par->{'_content'};
    printcontent($top_level);

    print "\n";

# Other inspection approaches tried earlier: Dumper($head), Dumper($tree),
# $title->as_text(), $body->as_text(), and looping over
# $head->content_list / $body->content_list.

# Tear the tree down explicitly and keep whatever delete() returns in $tree.
# NOTE(review): delete() presumably returns undef here - confirm against
# the HTML::Element documentation.
    $tree = $tree->delete();

    close DATA;
#}

 

功能

把HTML标签转化为perl的数据结构

找出tag和对应的值。

能够攫取网页内容与格式。

 

不足:

_content 键会被多打印一次,需要在第一个 if 语句中把它过滤掉。这是个小问题,不过类似的小问题还有不少。

无法找到内容所对应的原始格式,也就是说,还没有建立内容与其原始格式之间的关联。这是个大问题,说明功能尚不完善,也是下一步的工作重点。

转载于:https://www.cnblogs.com/ppazhang/p/3703573.html

你可能感兴趣的文章
VS2010上连接SQLite数据库
查看>>
Oracle数据库安装图文操作步骤
查看>>
贪心算法的简单理解
查看>>
Linux性能检测常用的10个基本命令
查看>>
Mac上传代码到Github
查看>>
day80 django模版学习
查看>>
Java实现注册邮箱激活验证
查看>>
Windows Phone 7 Belling‘s课堂(一) 磁贴的学习
查看>>
WPF 位置转化和动画
查看>>
【log4net】配置文件
查看>>
网易2017春招笔试真题编程题集合
查看>>
玩一下易语言 "和"字有多种读音,注定了它的重要性!!
查看>>
Python中的单例模式的几种实现方式的及优化
查看>>
【转】hadoop机架感知
查看>>
Oracle、DB2、SQLSERVER、Mysql、Access分页SQL语句梳理
查看>>
UVa 10806 Dijkstra,Dijkstra(最小费用最大流)
查看>>
Java排序算法之直接选择排序
查看>>
《鸟哥的Linux私房菜 基础学习篇(第四版)》 第23章 XWindows设定介绍 笔记
查看>>
js同比例缩放图片
查看>>
本机不装Oracle,使用plsql连接远程Oracle的方法
查看>>