|
|
楼主 |
发表于 2018-11-3 10:44:52
|
显示全部楼层
使用 Mojo::UserAgent
本帖最后由 523066680 于 2018-11-3 10:48 编辑
改用Mojo::UserAgent,似乎默认keep_alive,效率高好多,这次下载“美食”系列图片
=info
Author: 523066680/vicyang
Date: 2018-11
=cut
use Encode;
use Mojo::UserAgent;
use Mojo::DOM;
use File::Slurp;
use File::Basename qw/basename/;
use File::Path qw/mkpath/;
STDOUT->autoflush(1);
our $theme = "meishi";
our $wdir = "F:/Wallpaper/zol/$theme";
our $main = "http://desk.zol.com.cn";
our $ua = Mojo::UserAgent->new();
our @headers = (
"Host" => "desk.zol.com.cn",
"User-Agent" => "Firefox/63.0",
);
mkpath $wdir unless -e $wdir;
chdir $wdir;
获取所有主题链接
my @items;
my $iter = 1;
while ( get_item( $main ."/${theme}/${iter}.html", \@items ) >= 1 )
{
$iter++;
}
遍历页面、提取图片
my $idx = 1;
for my $item ( @items )
{
printf "[%03d/%d] %s %s\n", $idx++ , $#items+1, $item->{link}, $item->{title};
get_pages( $item->{link}, $item->{title} );
}
sub get_item
{
my ( $link, $ref ) = @_;
my $res = try_to_get( $link );
my $dom = $res->dom;
for my $e ( $dom->find(".photo-list-padding")->each )
{
printf "%s %s\n", $e->at("a")->attr("href"), $e->at("span")->attr("title");
push @$ref, {'link' => $main . $e->at("a")->attr("href"),
'title' => $e->at("span")->attr("title") };
}
# 判断是否为最后一页
if ( defined $dom->at("#pageNext") ) { return 1 }
else { return 0 }
}
--- Get each pages of item --- #
sub get_pages
{
my ($link, $title) = @_;
my $res = try_to_get( $link );
my $dom = $res->dom;
my $path = "${wdir}/${title}";
mkpath $path unless -e $path;
chdir $path;
# 图片数量
my $pics = $dom->at(".photo-list-box li i")->text;
$pics=~s/[^\d]//; #去除斜杠
my @files = glob "*.jpg";
if ( $#files+1 == $pics ) {
printf "Images already exist\n";
return;
}
for my $e ($dom->find(".photo-list-box a")->each )
{
#printf "%s\n", $e->attr("href");
get_pic( $main . $e->attr("href") );
}
}
sub get_pic
{
my ( $link ) = @_;
my $res = try_to_get( $link );
return unless (defined $res);
my $dom = $res->dom;
my $pic_url;
my $pic_name;
my $obj = $dom->at(".wallpaper-down dd a");
my $sub_url;
while (1)
{
$sub_url = $obj->attr("href");
# 某些图片没有提供指定分辨率的链接
if ( $sub_url !~/\.html/ ) {
printf "Did not found picture url, skip %s\n", $sub_url;
return;
}
$pic_name = basename($sub_url);
$pic_name =~ s/\.html/\.jpg/i;
printf "%s\n", $pic_name;
return if ( -e $pic_name );
my $res = try_to_get( "${main}${sub_url}" );
return unless (defined $res);
my $dom = $res->dom;
my $res = $ua->get( $dom->at("")->attr("src") )->result;
# 如果下载失败就选择下一个分辨率的图片
if ( $res->code == 502 ) { $obj = $obj->next; next; }
write_file( $pic_name, {binmode=>":raw"}, $res->body );
last;
}
}
sub try_to_get
{
our ($ua, @headers);
my $link = shift;
my $res;
my $retry = 0;
do
{
$res = $ua->get( $link )->result;
if ( $retry > 0 and $retry < 5 ) { print "Retry times: $retry\n"; }
elsif ( $retry > 5 ) { print "False\n"; return undef }
$retry++;
}
until ( $res->is_success );
return $res;
} |
|
|