• 2014-05-02


    #!/usr/bin/perl

    use utf8;
    use Data::Dumper qw(Dumper);
    use HTML::Element;
    use HTML::TreeBuilder;

    #binmode(STDIN,':encoding(utf8)');
    #binmode(STDOUT,':encoding(utf8)');
    binmode STDOUT,"utf8";
    #binmode(STDERR,':encoding(utf8)');
    $Data::Dumper::Indent = 1 ;

    #foreach my $file_name (@ARGV){
    # Input HTML file to parse; hard-coded for this experiment.
    my $file_name = "huxiu-webDetail";

    # Abort immediately when the input cannot be opened.  Previously a missing
    # file only produced a message and the script went on to parse an unopened
    # handle.  The three-argument form keeps the file name from being read as
    # an open mode, and the layer decodes the bytes as UTF-8 while reading.
    # The handle keeps the name DATA because it is closed under that name at
    # the end of the script.
    open(DATA, '<:encoding(UTF-8)', $file_name)
        or die "Cannot open $file_name: $!\n";

    # Build the element tree from the decoded stream.
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file(*DATA);
       
    #    $title = $tree->find_by_tag_name('title');
    #    @desc = $tree->find_by_tag_name('description');
    #    @link = $tree->find_by_tag_name('link');
    #    @image = $tree->find_by_tag_name('image');
       
    #    foreach(@title){
    #        print $title," ";
    #    }
    #    $title = $tree->find_by_tag_name('title');
        $head = $tree->find_by_tag_name("head");
        $body = $tree->find_by_tag_name("body");
    #    @metacontent = $meta->content_list;
    #    print $head->{'_parent'}{'_content'}[1]{'_content'}[0]{'_content'}[0]{'_content'}[0]{'_content'}[0]{'_content'}[0]{'_content'}[0]{'src'}," ";
       
    # _parent is a key of Hash,and the value is ref hash Array.
    #_content is a key of Arry,and the value is ref hash Arry.
        $var_par = $head->{'_parent'};
        $var_con = $head->{'_content'};
        $var_tag = $head->{'_tag'};

        foreach $key( keys %head)
        {
    #        print $key," ";
        }
    #    print $var_con;
        foreach $key(keys %$var_par)
        {
    #        print $key," ";
        }
        while(($key,$value)=each%$var_par)
        {
    #        print "$key=>$value ";
        }
    #########################################################
    #                                                        #
    # print ALL Hash key and Hash value in  Head`s _content #
    #                                                        #
    #########################################################
       
        print "========================================= ";
        my $icon_count = @$var_con - 1;
        for my $i (0 .. $icon_count)
        {
            my $hash = $var_con->[$i];
            foreach my $key(keys %$hash)
            {
    #            print $key," ";
            }
        }
        print "======================================== ";
    #    foreach $key(keys %($var_con[0]))
    #    {
    #        print $key," ";
    #    }
    #    foreach $key(keys (%$var_tag))
    #    {
    #        print $key," ";
    #    }
    #    foreach $key(keys %$body)
    #    {
    #        print $keys," ";
    #    }

        print $var_par->{'_content'}," ";
        print $var_con," ";
        print $var_tag," ";
    #    print $i=@$var_con," ";
    #    print $var_par->{'_content'}[0]," ";
    #    print $var_par->{'_content'}[0]{'_content'}[0]," ";
    #    print $var_con->[0]," ";
        sub printcontent{
            my $vax = @_->[0];
            my $tag = @_->[1];
            my $icount = @$vax-1;
    #        print $icount+1," ";
    #        print  $vax->[0]," ";
            for my $i(0 .. $icount){
    #            print  $i,$vax->[$i]," ";
    #            print $i,$vax->[$i]{'_tag'}," ";
    #            if( @$vax->[$i]{'_content'}!=())
    #            {
    #                print $i,":";
    #                printcontent ($vax->[$i]{'_content'});
    #            }
    #            elsif($vax->[$i]{'content'}!=undef)
    #            {
    #                print $i,":";
    #                printcontent ($vax[$i]{'content'});
    #            }
    #            else
    #            {
                    my $hash = $vax->[$i];
                    foreach my $key(keys %$hash)
                    {
                        if($key ne "_parent"){
                            print $i,":",$key,"=";
                            print $vax->[$i]{$key}," ";
                        }
                        elsif($key ==  '_content')
                        {
    #                        Dumper $key," ";
                            if(@$vax->[$i]{'_content'}[0]{'_content'}!=()){
    #                            print $i,":_content=",$vax->[$i]{'_content'}[0]," ";
                                printcontent($vax->[$i]{'_content'});
                            }
                            else{
                                print $i,":_content============",$vax->[$i]{'_content'}[0]," ";
                            }
                        }
                    }
    #            }
            }
        }
    #    printcontent($var_par->{'_content'});

        printcontent ($var_par->{'_content'});
        print " ";
    #    print  $head->{'_parent'}{'_content'}[1]{'_content'}[0]{'_content'}[1]{'_content'}[0]{'_content'}[0]{'_content'}[1]{'_content'}[0]{'_content'}[0]," ";
    #    print $var_con->[1]{'_content'}[0]," ";
    #    print $var_con->[2]{'content'}," ";
    #    print $var_con->[2]{'_tag'}," ";
    #    print $t=@$var;
    #    print Dumper($head);
    #    foreach( @metacontent)
    #    {
    #        print $_," ";
    #    }
       
       
    #    print Dumper($tree), " ";
    #    print $title->as_text()," ";
    #    print $body->as_text()," ";
    #    :q@p = $tree->find_by_tag_name("body")->content_list;
    #    @headcontent = $head->content_list;
    #    @bodycontent = $body->content_list;   
       
    #    print Dumper(@headcontent);
    #    print Dumper(@bodycontent)," ";
    #    foreach(@headcontent)
    #    {
    #        print $_->as_text()," " ;
    #    }

        $tree = $tree->delete;
        close(DATA);
    #}

    功能

    把HTML标签转化为perl的数据结构

    找出tag和对应的值。

    能够攫取网页内容与格式。

    不足:

    _content会多打印一次,需要在第一个if语句中把它过滤掉。这是个小问题,但这种类型的问题还比较多。

    找不到内容对应的原来格式。即没有做内容与原来格式的关联。大问题。功能不完善。下一步的重点。

  • 相关阅读:
    PPR的断管
    排水地漏的功能与种类
    PPR管及管件的类型、规格与选用
    水龙头的安装、拆卸与阀芯更换
    为不同的用户生成不同的 Kibana 界面
    如何让匿名的用户访问受限的资源
    Beats processors
    Elasticsearch 开发入门
    Elasticsearch Dockerfile 例子
    燃气热水器的结构与安装
  • 原文地址:https://www.cnblogs.com/ppazhang/p/3703573.html
Copyright © 2020-2023  润新知