#!/usr/bin/perl
use utf8;
use strict;
use warnings;

use Data::Dumper qw(Dumper);
use HTML::Element;
use HTML::TreeBuilder;
use Scalar::Util qw(reftype);

#binmode(STDIN,':encoding(utf8)');
#binmode(STDOUT,':encoding(utf8)');
binmode STDOUT,"utf8";
#binmode(STDERR,':encoding(utf8)');
$Data::Dumper::Indent = 1 ;
#foreach my $file_name (@ARGV){
# ---------------------------------------------------------------------------
# Load the saved page, parse it, and capture the <head> element's internals.
# ---------------------------------------------------------------------------
my $file_name = "huxiu-webDetail";

# Decode the file as UTF-8 while reading so the parser is handed characters.
# Abort on failure: the script has nothing to report without its input, and
# open()'s own error text already covers the "file does not exist" case.
# The bareword DATA handle is retained because the end of the script closes
# the input under that name.
open(DATA, '<:encoding(UTF-8)', $file_name)
    or die "Cannot open '$file_name': $!\n";

my $tree = HTML::TreeBuilder->new;
$tree->parse_file(\*DATA);

my $head = $tree->find_by_tag_name("head");
my $body = $tree->find_by_tag_name("body");    # not used below; handy when debugging
die "No <head> element found in '$file_name'\n" unless $head;

# Each parsed element is a hash. The keys read here are:
#   _parent  - the enclosing element (itself a hash reference)
#   _content - an array reference holding the element's children
#   _tag     - the element's tag name
my $var_par = $head->{'_parent'};
my $var_con = $head->{'_content'};
my $var_tag = $head->{'_tag'};

# The per-key dump of the <head> children that once ran between these two
# rulers had every print statement commented out, so only the rulers remain.
# To inspect keys again: print "$_\n" for sort keys %{$head};
print "=========================================\n";
print "========================================\n";

# Debug output: the root element's child list, shown as an ARRAY(0x...) address.
print $var_par->{'_content'}, "\n";
# Debug output: the <head> element's child list (shown as an ARRAY(0x...)
# address) followed by its tag name.
print $var_con, "\n";
print $var_tag, "\n";

# printcontent(\@nodes [, $tag])
#
# Recursively prints the attributes of every element in @nodes.
#
# Parameters:
#   \@nodes - array reference of sibling nodes. Elements are hash references
#             (blessed or not); text nodes are plain strings.
#   $tag    - accepted for backward compatibility; not used.
#
# Output, per element at index $i among its siblings:
#   "$i:$key=$value"  for every key except _parent and _content (keys sorted
#                     so the output is reproducible between runs);
#   then, if the element has children:
#     - first child is an element -> recurse into the whole child list;
#     - first child is text       -> "$i:_content============$text".
#
# Known limitation: only the first child decides between the two cases, so in
# mixed content such as "text <b>bold</b>" the later children are not visited.
#
# Returns nothing. Anything that is not an array reference is ignored.
sub printcontent {
    my ($nodes, $tag) = @_;

    # Deeply nested documents are legitimate input, not runaway recursion.
    no warnings 'recursion';

    my $list_type = reftype($nodes);
    return unless defined $list_type && $list_type eq 'ARRAY';

    for my $i (0 .. $#{$nodes}) {
        my $node = $nodes->[$i];

        # Text nodes are plain strings; they are reported by their parent.
        my $node_type = reftype($node);
        next unless defined $node_type && $node_type eq 'HASH';

        for my $key (sort keys %{$node}) {
            next if $key eq '_parent';     # back-reference; following it would loop
            next if $key eq '_content';    # children are handled below
            my $value = defined $node->{$key} ? $node->{$key} : '';
            print $i, ":", $key, "=", $value, "\n";
        }

        # exists() first, so a childless element is never autovivified.
        next unless exists $node->{'_content'};
        my $children      = $node->{'_content'};
        my $children_type = reftype($children);
        next unless defined $children_type
            && $children_type eq 'ARRAY'
            && @{$children};

        my $first = $children->[0];
        if (ref $first) {
            printcontent($children);
        }
        else {
            print $i, ":_content============", (defined $first ? $first : ''), "\n";
        }
    }

    return;
}

# Walk the document starting from the root element's children.
printcontent($var_par->{'_content'});
print "\n";

# Earlier experiments, now removed, dumped the parse with Dumper($tree) and
# Dumper($head), and printed text through as_text() and content_list().

# Elements hold both a _parent and a _content reference, so the tree refers
# to itself in cycles and has to be torn down explicitly. The result of
# delete() is assigned back so $tree no longer points at the destroyed tree.
$tree = $tree->delete;

close(DATA) or warn "Cannot close input: $!\n";
#}    # end of the disabled "foreach my $file_name (@ARGV)" loop
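# Features:
#   - Converts HTML tags into a Perl data structure.
#   - Finds each tag and its corresponding value.
#   - Can extract a web page's content and formatting.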
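# Shortcomings:
#   - _content is printed one extra time; it should be filtered out in the
#     first `if` statement. Minor issue, but there are quite a few of this kind.
#   - The original format that a piece of content came from cannot be found,
#     i.e. content is not linked back to its original formatting. Major issue:
#     the feature is incomplete. This is the focus of the next step.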