Board logo

标题: [原创代码] [Perl]批量下载美女壁纸(ZOL桌面壁纸) [打印本页]

作者: 523066680    时间: 2018-11-1 11:19     标题: [Perl]批量下载美女壁纸(ZOL桌面壁纸)

本帖最后由 523066680 于 2018-11-1 11:23 编辑

最近需要素材便写了,没有加入多线程,就这样按顺序抓~
如果因为某种原因中断了,重新开始,会判断已完成的部分节省时间。

keep_alive 打开后好像会导致后续页面访问不了,所以没开。

运行环境: Straberry Perl 5.24
  1. =info
  2.     Author: 523066680/vicyang
  3.     Date: 2018-11
  4. =cut
  5. use Encode;
  6. use LWP::UserAgent;
  7. use Mojo::DOM;
  8. use File::Slurp;
  9. use File::Basename qw/basename/;
  10. use File::Path qw/mkpath/;
  11. STDOUT->autoflush(1);
  12. our $wdir = "D:/temp/wallpaper_zol/meinv";
  13. our $main = "http://desk.zol.com.cn";
  14. my $ua = LWP::UserAgent->new( agent => "Mozilla/5.0" );
  15. our @headers = (
  16.         "Host" => "desk.zol.com.cn",
  17.         "User-Agent" => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
  18.     );
  19. mkpath $wdir unless -e $wdir;
  20. chdir $wdir;
  21. # 获取所有主题链接
  22. my @items;
  23. my $iter = 1;
  24. while ( get_item( $main ."/meinv/${iter}.html", \@items ) >= 1 )
  25. {
  26.     $iter++;
  27. }
  28. # 遍历页面、提取图片
  29. my $idx = 0;
  30. for my $item ( @items )
  31. {
  32.     printf "[%03d/%d] %s %s\n",  $idx++ , $#items+1, $item->{link}, $item->{title};
  33.     get_pages( $item->{link}, $item->{title} );
  34. }
  35. sub get_item
  36. {
  37.     our ($main, @headers);
  38.     my ( $link, $ref ) = @_;
  39.     # 重建 UserAgent 对象
  40.     my $ua = LWP::UserAgent->new();
  41.     my $res = $ua->get($link, @headers);
  42.     my $dom = Mojo::DOM->new( $res->content );
  43.     for my $e ( $dom->find(".photo-list-padding")->each )
  44.     {
  45.         printf "%s %s\n", $e->at("a")->attr("href"), $e->at("span")->attr("title");
  46.         push @$ref, {
  47.                 'link' => $main . $e->at("a")->attr("href"),  
  48.                 'title' => $e->at("span")->attr("title")
  49.                 }
  50.     }
  51.     if ( defined $dom->at("#pageNext") ) { return 1 }
  52.     else {  return 0 }
  53. }
  54. # --- Get each pages of item --- #
  55. sub get_pages
  56. {
  57.     our @headers;
  58.     my ($link, $title) = @_;
  59.     my $res = $ua->get( $link, @headers );
  60.     my $dom = Mojo::DOM->new( $res->content );
  61.     my $path = "${wdir}/${title}";
  62.     mkpath $path unless -e $path;
  63.     chdir $path;
  64.     # 图片数量
  65.     my $pics = $dom->at(".photo-list-box li i")->text;
  66.     $pics=~s/[^\d]//;  #去除斜杠
  67.     my @files = glob "*.jpg";
  68.     if ( $#files+1 == $pics ) {
  69.         printf "Images already exist\n";
  70.         return;
  71.     }
  72.     for my $e ($dom->find(".photo-list-box a")->each )
  73.     {
  74.         #printf "%s\n", $e->attr("href");
  75.         get_pic( $main . $e->attr("href") );
  76.     }
  77. }
  78. sub get_pic
  79. {
  80.     my ( $link ) = @_;
  81.     # 刷新 UserAgent 对象
  82.     my $ua = LWP::UserAgent->new( timeout => 6 );
  83.     my $res = $ua->get($link);
  84.     my $dom = Mojo::DOM->new($res->content);
  85.     my $pic_url;
  86.     my $pic_name;
  87.     my $sub_url = $dom->at(".wallpaper-down dd a")->attr("href");
  88.     $pic_name = basename($sub_url);
  89.     $pic_name =~ s/\.html/\.jpg/i;
  90.     printf "%s\n", $pic_name;
  91.     return if ( -e $pic_name );
  92.     my $retry = 0;
  93.     do
  94.     {
  95.         $res = $ua->get( "${main}${sub_url}" );
  96.         if    ( $retry > 0 and $retry < 5 ) { print "retry times: $retry\n"; }
  97.         elsif ( $retry > 5 )                { print "False\n"; return }   
  98.         $retry++;
  99.     }
  100.     until ( $res->is_success );
  101.     $dom = Mojo::DOM->new( $res->content );
  102.     $ua->mirror( $dom->at("")->attr("src"), $pic_name );
  103. }
复制代码

作者: 523066680    时间: 2018-11-3 10:44     标题: 使用 Mojo::UserAgent

本帖最后由 523066680 于 2018-11-3 10:48 编辑

改用Mojo::UserAgent,似乎默认keep_alive,效率高好多,这次下载“美食”系列图片






欢迎光临 批处理之家 (http://www.bathome.net/) Powered by Discuz! 7.2