[新手上路]批处理新手入门导读[视频教程]批处理基础视频教程[视频教程]VBS基础视频教程[批处理精品]批处理版照片整理器
[批处理精品]纯批处理备份&还原驱动[批处理精品]CMD命令50条不能说的秘密[在线下载]第三方命令行工具[在线帮助]VBScript / JScript 在线参考
返回列表 发帖

[原创代码] [Perl]批量下载美女壁纸(ZOL桌面壁纸)

本帖最后由 523066680 于 2018-11-1 11:23 编辑

最近需要素材便写了,没有加入多线程,就这样按顺序抓~
如果因为某种原因中断了,重新开始,会判断已完成的部分节省时间。

keep_alive 打开后好像会导致后续页面访问不了,所以没开。

运行环境: Straberry Perl 5.24
  1. =info
  2.     Author: 523066680/vicyang
  3.     Date: 2018-11
  4. =cut
  5. use Encode;
  6. use LWP::UserAgent;
  7. use Mojo::DOM;
  8. use File::Slurp;
  9. use File::Basename qw/basename/;
  10. use File::Path qw/mkpath/;
  11. STDOUT->autoflush(1);
  12. our $wdir = "D:/temp/wallpaper_zol/meinv";
  13. our $main = "http://desk.zol.com.cn";
  14. my $ua = LWP::UserAgent->new( agent => "Mozilla/5.0" );
  15. our @headers = (
  16.         "Host" => "desk.zol.com.cn",
  17.         "User-Agent" => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
  18.     );
  19. mkpath $wdir unless -e $wdir;
  20. chdir $wdir;
  21. # 获取所有主题链接
  22. my @items;
  23. my $iter = 1;
  24. while ( get_item( $main ."/meinv/${iter}.html", \@items ) >= 1 )
  25. {
  26.     $iter++;
  27. }
  28. # 遍历页面、提取图片
  29. my $idx = 0;
  30. for my $item ( @items )
  31. {
  32.     printf "[%03d/%d] %s %s\n",  $idx++ , $#items+1, $item->{link}, $item->{title};
  33.     get_pages( $item->{link}, $item->{title} );
  34. }
  35. sub get_item
  36. {
  37.     our ($main, @headers);
  38.     my ( $link, $ref ) = @_;
  39.     # 重建 UserAgent 对象
  40.     my $ua = LWP::UserAgent->new();
  41.     my $res = $ua->get($link, @headers);
  42.     my $dom = Mojo::DOM->new( $res->content );
  43.     for my $e ( $dom->find(".photo-list-padding")->each )
  44.     {
  45.         printf "%s %s\n", $e->at("a")->attr("href"), $e->at("span")->attr("title");
  46.         push @$ref, {
  47.                 'link' => $main . $e->at("a")->attr("href"),  
  48.                 'title' => $e->at("span")->attr("title")
  49.                 }
  50.     }
  51.     if ( defined $dom->at("#pageNext") ) { return 1 }
  52.     else {  return 0 }
  53. }
  54. # --- Get each pages of item --- #
  55. sub get_pages
  56. {
  57.     our @headers;
  58.     my ($link, $title) = @_;
  59.     my $res = $ua->get( $link, @headers );
  60.     my $dom = Mojo::DOM->new( $res->content );
  61.     my $path = "${wdir}/${title}";
  62.     mkpath $path unless -e $path;
  63.     chdir $path;
  64.     # 图片数量
  65.     my $pics = $dom->at(".photo-list-box li i")->text;
  66.     $pics=~s/[^\d]//;  #去除斜杠
  67.     my @files = glob "*.jpg";
  68.     if ( $#files+1 == $pics ) {
  69.         printf "Images already exist\n";
  70.         return;
  71.     }
  72.     for my $e ($dom->find(".photo-list-box a")->each )
  73.     {
  74.         #printf "%s\n", $e->attr("href");
  75.         get_pic( $main . $e->attr("href") );
  76.     }
  77. }
  78. sub get_pic
  79. {
  80.     my ( $link ) = @_;
  81.     # 刷新 UserAgent 对象
  82.     my $ua = LWP::UserAgent->new( timeout => 6 );
  83.     my $res = $ua->get($link);
  84.     my $dom = Mojo::DOM->new($res->content);
  85.     my $pic_url;
  86.     my $pic_name;
  87.     my $sub_url = $dom->at(".wallpaper-down dd a")->attr("href");
  88.     $pic_name = basename($sub_url);
  89.     $pic_name =~ s/\.html/\.jpg/i;
  90.     printf "%s\n", $pic_name;
  91.     return if ( -e $pic_name );
  92.     my $retry = 0;
  93.     do
  94.     {
  95.         $res = $ua->get( "${main}${sub_url}" );
  96.         if    ( $retry > 0 and $retry < 5 ) { print "retry times: $retry\n"; }
  97.         elsif ( $retry > 5 )                { print "False\n"; return }   
  98.         $retry++;
  99.     }
  100.     until ( $res->is_success );
  101.     $dom = Mojo::DOM->new( $res->content );
  102.     $ua->mirror( $dom->at("")->attr("src"), $pic_name );
  103. }
复制代码

使用 Mojo::UserAgent

本帖最后由 523066680 于 2018-11-3 10:48 编辑

改用Mojo::UserAgent,似乎默认keep_alive,效率高好多,这次下载“美食”系列图片

      =info
          Author: 523066680/vicyang
          Date: 2018-11
      =cut

      use Encode;
      use Mojo::UserAgent;
      use Mojo::DOM;
      use File::Slurp;
      use File::Basename qw/basename/;
      use File::Path qw/mkpath/;
      STDOUT->autoflush(1);

      our $theme = "meishi";
      our $wdir = "F:/Wallpaper/zol/$theme";
      our $main = "http://desk.zol.com.cn";
      our $ua = Mojo::UserAgent->new();
      our @headers = (
              "Host" => "desk.zol.com.cn",
              "User-Agent" => "Firefox/63.0",
          );

      mkpath $wdir unless -e $wdir;
      chdir $wdir;

      获取所有主题链接
      my @items;
      my $iter = 1;
      while ( get_item( $main ."/${theme}/${iter}.html", \@items ) >= 1 )
      {
          $iter++;
      }

      遍历页面、提取图片
      my $idx = 1;
      for my $item ( @items )
      {
          printf "[%03d/%d] %s %s\n",  $idx++ , $#items+1, $item->{link}, $item->{title};
          get_pages( $item->{link}, $item->{title} );
      }

      sub get_item
      {
          my ( $link, $ref ) = @_;
          my $res = try_to_get( $link );
          my $dom = $res->dom;

          for my $e ( $dom->find(".photo-list-padding")->each )
          {
              printf "%s %s\n", $e->at("a")->attr("href"), $e->at("span")->attr("title");
              push @$ref, {'link'  => $main . $e->at("a")->attr("href"),
                           'title' => $e->at("span")->attr("title") };
          }
          # 判断是否为最后一页
          if ( defined $dom->at("#pageNext") ) { return 1 }
          else {  return 0 }
      }

      --- Get each pages of item --- #

      sub get_pages
      {
          my ($link, $title) = @_;
          my $res = try_to_get( $link );
          my $dom = $res->dom;

          my $path = "${wdir}/${title}";
          mkpath $path unless -e $path;
          chdir $path;

          # 图片数量
          my $pics = $dom->at(".photo-list-box li i")->text;
          $pics=~s/[^\d]//;  #去除斜杠

          my @files = glob "*.jpg";
          if ( $#files+1 == $pics ) {
              printf "Images already exist\n";
              return;
          }

          for my $e ($dom->find(".photo-list-box a")->each )
          {
              #printf "%s\n", $e->attr("href");
              get_pic( $main . $e->attr("href") );
          }
      }

      sub get_pic
      {
          my ( $link ) = @_;
          my $res = try_to_get( $link );
          return unless (defined $res);

          my $dom = $res->dom;
          my $pic_url;
          my $pic_name;

          my $obj = $dom->at(".wallpaper-down dd a");
          my $sub_url;

          while (1)
          {
              $sub_url = $obj->attr("href");
              # 某些图片没有提供指定分辨率的链接
              if ( $sub_url !~/\.html/ ) {
                  printf "Did not found picture url, skip %s\n", $sub_url;
                  return;
              }

              $pic_name = basename($sub_url);
              $pic_name =~ s/\.html/\.jpg/i;
              printf "%s\n", $pic_name;
              return if ( -e $pic_name );

              my $res = try_to_get( "${main}${sub_url}" );
              return unless (defined $res);

              my $dom = $res->dom;
              my $res = $ua->get( $dom->at("")->attr("src") )->result;
              
              # 如果下载失败就选择下一个分辨率的图片
              if ( $res->code == 502 ) { $obj = $obj->next; next; }

              write_file( $pic_name, {binmode=>":raw"}, $res->body );
              last;            
          }
      }

      sub try_to_get
      {
          our ($ua, @headers);
          my $link = shift;
          my $res;
          my $retry = 0;
          do
          {
              $res = $ua->get( $link )->result;
              if    ( $retry > 0 and $retry < 5 ) { print "Retry times: $retry\n"; }
              elsif ( $retry > 5 )                { print "False\n"; return undef }
              $retry++;
          }
          until ( $res->is_success );

          return $res;
      }

TOP

返回列表