本帖最后由 523066680 于 2018-11-1 11:23 编辑
最近需要素材便写了,没有加入多线程,就这样按顺序抓~
如果因为某种原因中断了,重新开始,会判断已完成的部分节省时间。
keep_alive 打开后好像会导致后续页面访问不了,所以没开。
运行环境: Strawberry Perl 5.24

=info
    Author: 523066680/vicyang
    Date: 2018-11
=cut
-
use strict;
use warnings;

use Encode;
use File::Basename qw/basename/;
use File::Path qw/mkpath/;
use File::Slurp;
use IO::Handle;
use LWP::UserAgent;
use Mojo::DOM;
# Flush progress output as soon as it is printed.
STDOUT->autoflush(1);

# Shared configuration. These remain package globals / a file-level lexical
# because the subs below refer to them by name.
our $wdir = "D:/temp/wallpaper_zol/meinv";    # download root directory
our $main = "http://desk.zol.com.cn";         # prefix for site-relative links
my $ua = LWP::UserAgent->new( agent => "Mozilla/5.0" );
our @headers = (
    "Host"       => "desk.zol.com.cn",
    "User-Agent" => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
);

mkpath $wdir unless -e $wdir;
chdir $wdir or die "Cannot chdir to $wdir: $!\n";

# Collect all album links: keep requesting listing pages until get_item()
# reports that there is no further page.
my @items;
my $iter = 1;
while ( get_item( $main . "/meinv/${iter}.html", \@items ) >= 1 ) {
    $iter++;
}

# Walk every album and download its pictures. The counter is pre-incremented
# so progress reads [001/N] .. [N/N] rather than starting at zero.
my $idx   = 0;
my $total = scalar @items;
for my $item (@items) {
    printf "[%03d/%d] %s %s\n", ++$idx, $total, $item->{link}, $item->{title};
    get_pages( $item->{link}, $item->{title} );
}
-
# Fetch one listing page and append every album found on it to the caller's
# array.
#
# Arguments:
#   $link - URL of the listing page to request
#   $ref  - array reference; one { link => ..., title => ... } hash is pushed
#           per album, with link prefixed by the global $main
#
# Returns 1 when the page contains a "#pageNext" element (the caller should
# request the next page) and 0 otherwise, including when the request fails.
sub get_item {
    our ( $main, @headers );
    my ( $link, $ref ) = @_;

    # Build a new UserAgent object for each listing page.
    my $ua  = LWP::UserAgent->new();
    my $res = $ua->get( $link, @headers );
    unless ( $res->is_success ) {
        warn "get_item: $link: ", $res->status_line, "\n";
        return 0;
    }

    my $dom = Mojo::DOM->new( $res->content );

    for my $e ( $dom->find(".photo-list-padding")->each ) {
        # at() returns undef when nothing matches; skip incomplete entries
        # instead of dying on a method call against undef.
        my $anchor = $e->at("a");
        my $span   = $e->at("span");
        next unless defined $anchor and defined $span;

        my $href  = $anchor->attr("href");
        my $title = $span->attr("title");
        next unless defined $href and defined $title;

        printf "%s %s\n", $href, $title;
        push @$ref, {
            'link'  => $main . $href,
            'title' => $title,
        };
    }

    return defined $dom->at("#pageNext") ? 1 : 0;
}
-
- # --- Get each pages of item --- #
-
# Download every picture of one album into "$wdir/$title".
#
# Arguments:
#   $link  - URL of the album's first page
#   $title - album title, used verbatim as the directory name
#
# Changes the working directory to the album directory. If the number of
# *.jpg files already there equals the picture count shown on the page, the
# album is treated as complete and skipped. Returns nothing.
sub get_pages {
    our @headers;
    my ( $link, $title ) = @_;

    my $res = $ua->get( $link, @headers );
    unless ( $res->is_success ) {
        warn "get_pages: $link: ", $res->status_line, "\n";
        return;
    }
    my $dom = Mojo::DOM->new( $res->content );

    # NOTE(review): $title comes straight from the remote page and is used
    # as a path component without sanitizing -- confirm that is acceptable.
    my $path = "${wdir}/${title}";

    # mkpath croaks on failure; trap it so one bad title skips the album
    # rather than aborting the whole run.
    unless ( -e $path ) {
        unless ( eval { mkpath $path; 1 } ) {
            warn "get_pages: cannot create $path: $@";
            return;
        }
    }

    # Without this check a failed chdir would make the resume test and the
    # downloads operate on whatever directory was current before.
    unless ( chdir $path ) {
        warn "get_pages: cannot chdir to $path: $!\n";
        return;
    }

    # Picture count: take the last run of digits in the counter text, so
    # surrounding characters such as the slash are ignored.
    my $pics;
    my $counter = $dom->at(".photo-list-box li i");
    if ( defined $counter ) {
        ($pics) = $counter->text =~ /(\d+)\D*\z/;
    }

    my @files = glob "*.jpg";
    if ( defined $pics and @files == $pics ) {
        printf "Images already exist\n";
        return;
    }

    for my $e ( $dom->find(".photo-list-box a")->each ) {
        my $href = $e->attr("href");
        next unless defined $href;
        get_pic( $main . $href );
    }

    return;
}
-
# Download a single picture into the current working directory.
#
# Arguments:
#   $link - URL of the picture's page inside an album
#
# The page's ".wallpaper-down dd a" link leads to a second page that holds
# the actual image. The local file name is the basename of that link with
# ".html" replaced by ".jpg"; an existing file of that name is not fetched
# again. Returns nothing.
sub get_pic {
    my ($link) = @_;

    # Use a fresh UserAgent object with a short timeout.
    my $ua  = LWP::UserAgent->new( timeout => 6 );
    my $res = $ua->get($link);
    my $dom = Mojo::DOM->new( $res->content );

    my $down    = $dom->at(".wallpaper-down dd a");
    my $sub_url = defined $down ? $down->attr("href") : undef;
    unless ( defined $sub_url ) {
        warn "get_pic: no download link found on $link\n";
        return;
    }

    my $pic_name = basename($sub_url);
    $pic_name =~ s/\.html/.jpg/i;
    printf "%s\n", $pic_name;

    return if -e $pic_name;

    # One initial attempt plus a bounded number of retries. Success is
    # tested before deciding to give up, so a late success is never
    # reported as a failure.
    my $max_retries = 5;
    my $retry       = 0;
    $res = $ua->get("${main}${sub_url}");
    until ( $res->is_success ) {
        if ( ++$retry > $max_retries ) {
            print "False\n";
            return;
        }
        print "retry times: $retry\n";
        $res = $ua->get("${main}${sub_url}");
    }

    $dom = Mojo::DOM->new( $res->content );

    # NOTE(review): the selector was an empty string in the posted source
    # (apparently lost in the paste); "img" is assumed -- confirm against
    # the live page markup.
    my $img = $dom->at("img");
    my $src = defined $img ? $img->attr("src") : undef;
    unless ( defined $src ) {
        warn "get_pic: no image found on ${main}${sub_url}\n";
        return;
    }

    my $saved = $ua->mirror( $src, $pic_name );
    unless ( $saved->is_success or $saved->code == 304 ) {
        warn "get_pic: $src: ", $saved->status_line, "\n";
    }

    return;
}
复制代码
|