[Perl]批量下载美女壁纸(ZOL桌面壁纸)
[i=s] 本帖最后由 523066680 于 2018-11-1 11:23 编辑 [/i]最近需要素材便写了,没有加入多线程,就这样按顺序抓~
如果因为某种原因中断了,重新开始,会判断已完成的部分节省时间。
keep_alive 打开后好像会导致后续页面访问不了,所以没开。
运行环境: [url=http://strawberryperl.com/releases.html]Strawberry Perl 5.24[/url][code]=info
Author: 523066680/vicyang
Date: 2018-11
=cut
use Encode;
use LWP::UserAgent;
use Mojo::DOM;
use File::Slurp;
use File::Basename qw/basename/;
use File::Path qw/mkpath/;
# Flush progress output immediately instead of letting it sit in a buffer.
STDOUT->autoflush(1);

# Local directory that receives one sub-directory per wallpaper set.
our $wdir = "D:/temp/wallpaper_zol/meinv";
# Site root; relative hrefs scraped from the pages are appended to it.
our $main = "http://desk.zol.com.cn";
# File-scoped agent; get_pages() below uses this lexical directly.
my $ua = LWP::UserAgent->new( agent => "Mozilla/5.0" );
# Extra request headers passed along with the page requests.
our @headers = (
    "Host" => "desk.zol.com.cn",
    "User-Agent" => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
);

mkpath $wdir unless -e $wdir;
# Stop early if the working directory is unusable; otherwise every file
# would silently land in whatever directory the script was started from.
chdir $wdir or die "Cannot chdir to $wdir: $!\n";

# Collect the links of all wallpaper sets, one listing page at a time.
# get_item() returns 0 once a page has no "next page" element.
my @items;
my $iter = 1;
while ( get_item( $main ."/meinv/${iter}.html", \@items ) >= 1 )
{
    $iter++;
}

# Walk through every set and download its pictures.
# The counter is 1-based so that it matches the 1-based total next to it.
my $idx = 1;
for my $item ( @items )
{
    printf "[%03d/%d] %s %s\n", $idx++, scalar @items, $item->{link}, $item->{title};
    get_pages( $item->{link}, $item->{title} );
}
# Fetch one listing page and collect the wallpaper sets it links to.
#
# Arguments:
#   $link - URL of the listing page.
#   $ref  - array reference; one { link => ..., title => ... } hash is
#           pushed onto it for every complete entry found on the page.
#
# Returns 1 when the page contains a "#pageNext" element, 0 when it does
# not or when the page could not be fetched.
sub get_item
{
    our ($main, @headers);
    my ( $link, $ref ) = @_;

    # A fresh UserAgent object is built for every listing page.
    my $ua  = LWP::UserAgent->new();
    my $res = $ua->get($link, @headers);
    unless ( $res->is_success ) {
        printf "Failed to fetch %s: %s\n", $link, $res->status_line;
        return 0;
    }

    my $dom = Mojo::DOM->new( $res->content );
    for my $e ( $dom->find(".photo-list-padding")->each )
    {
        # at() yields undef when nothing matches, so look the elements up
        # once and skip entries that lack the link or the title.
        my $anchor = $e->at("a");
        my $label  = $e->at("span");
        next unless defined $anchor and defined $label;

        my $href  = $anchor->attr("href");
        my $title = $label->attr("title");
        next unless defined $href and defined $title;

        printf "%s %s\n", $href, $title;
        push @$ref, {
            'link'  => $main . $href,
            'title' => $title,
        };
    }

    return defined $dom->at("#pageNext") ? 1 : 0;
}
# --- Get each pages of item --- #
# Download all pictures of one wallpaper set into "<wdir>/<title>".
#
# Arguments:
#   $link  - URL of the set's first page.
#   $title - set title; used as the name of the target directory.
#
# Changes the current directory to the target directory. Returns nothing.
# The set is skipped when the number of *.jpg files already present equals
# the picture count shown on the page.
sub get_pages
{
    our @headers;
    my ($link, $title) = @_;

    my $res = $ua->get( $link, @headers );
    unless ( $res->is_success ) {
        printf "Failed to fetch %s: %s\n", $link, $res->status_line;
        return;
    }
    my $dom = Mojo::DOM->new( $res->content );

    my $path = "${wdir}/${title}";
    mkpath $path unless -e $path;
    # Without this check a failed chdir would drop the pictures into the
    # directory of the previously processed set.
    unless ( chdir $path ) {
        printf "Cannot chdir to %s: %s\n", $path, $!;
        return;
    }

    # Picture count of the set. The counter text presumably looks like
    # "/8" (TODO confirm); the last run of digits is taken as the total so
    # surrounding punctuation or whitespace cannot break the comparison.
    my $counter = $dom->at(".photo-list-box li i");
    my $pics;
    if ( defined $counter and $counter->text =~ /(\d+)\D*\z/ ) {
        $pics = $1;
    }

    my @files = glob "*.jpg";
    if ( defined $pics and @files == $pics ) {
        printf "Images already exist\n";
        return;
    }

    for my $e ( $dom->find(".photo-list-box a")->each )
    {
        my $href = $e->attr("href");
        next unless defined $href;    # anchor without a target
        get_pic( $main . $href );
    }
    return;
}
# Download a single picture into the current directory.
#
# Arguments:
#   $link - URL of the picture's detail page.
#
# The detail page links to a ".html" sub-page for a specific resolution;
# that sub-page holds the actual image. The file is named after the
# sub-page with ".html" replaced by ".jpg" and is skipped when it already
# exists. Returns nothing.
sub get_pic
{
    my ( $link ) = @_;

    # A fresh UserAgent object with a short timeout for every picture.
    my $ua  = LWP::UserAgent->new( timeout => 6 );
    my $res = $ua->get($link);
    unless ( $res->is_success ) {
        printf "Failed to fetch %s: %s\n", $link, $res->status_line;
        return;
    }
    my $dom = Mojo::DOM->new( $res->content );

    # Some pictures do not offer a link to a resolution-specific page.
    my $anchor  = $dom->at(".wallpaper-down dd a");
    my $sub_url = defined $anchor ? $anchor->attr("href") : undef;
    unless ( defined $sub_url and $sub_url =~ /\.html/i ) {
        printf "No picture url found, skip %s\n", $link;
        return;
    }

    my $pic_name = basename($sub_url);
    $pic_name =~ s/\.html/\.jpg/i;
    printf "%s\n", $pic_name;
    return if -e $pic_name;

    # One initial attempt plus up to five retries. Success is tested right
    # after each request, so a late success is never thrown away.
    my $max_retries = 5;
    my $retry       = 0;
    while (1)
    {
        $res = $ua->get( "${main}${sub_url}" );
        last if $res->is_success;
        if ( $retry >= $max_retries ) { print "False\n"; return }
        $retry++;
        print "retry times: $retry\n";
    }

    $dom = Mojo::DOM->new( $res->content );
    # NOTE(review): the selector was an empty string in the posted listing,
    # most likely stripped by the forum software; "img" is assumed to be
    # the element carrying the picture on the sub-page - confirm.
    my $img = $dom->at("img");
    my $src = defined $img ? $img->attr("src") : undef;
    unless ( defined $src ) {
        printf "No image element found in %s%s\n", $main, $sub_url;
        return;
    }

    my $saved = $ua->mirror( $src, $pic_name );
    unless ( $saved->is_success or $saved->code == 304 ) {
        printf "Download failed for %s: %s\n", $pic_name, $saved->status_line;
    }
    return;
}[/code]
使用 Mojo::UserAgent
[i=s] 本帖最后由 523066680 于 2018-11-3 10:48 编辑 [/i]改用Mojo::UserAgent,似乎默认keep_alive,效率高好多,这次下载“美食”系列图片
[list][list][table=80%, #f8f8f8][tr][td][font=consolas][size=2][font=consolas][size=3][color=#555555]=[/color]info
Author[color=#555555]:[/color] [color=#666666]523066680[/color][color=#555555]/[/color]vicyang
Date[color=#555555]:[/color] [color=#666666]2018[/color][color=#555555]-[/color][color=#666666]11[/color]
=cut
[color=#008000][b]use[/b][/color] Encode[color=#555555];[/color]
[color=#008000][b]use[/b][/color] Mojo[color=#555555]::[/color]UserAgent[color=#555555];[/color]
[color=#008000][b]use[/b][/color] Mojo[color=#555555]::[/color]DOM[color=#555555];[/color]
[color=#008000][b]use[/b][/color] File[color=#555555]::[/color]Slurp[color=#555555];[/color]
[color=#008000][b]use[/b][/color] File[color=#555555]::[/color]Basename qw[color=#555555]/basename/[/color][color=#555555];[/color]
[color=#008000][b]use[/b][/color] File[color=#555555]::[/color]Path qw[color=#555555]/mkpath/[/color][color=#555555];[/color]
STDOUT[color=#555555]->[/color][color=#555555]autoflush[/color][color=#555555]([/color][color=#666666]1[/color][color=#555555]);[/color]
[color=#b00040]our[/color] [color=#b00040]$theme[/color] [color=#555555]=[/color] [color=#ba2121]"meishi"[/color][color=#555555];[/color]
[color=#b00040]our[/color] [color=#b00040]$wdir[/color] [color=#555555]=[/color] [color=#ba2121]"F:/Wallpaper/zol/[/color][color=#ba2121]$theme[/color][color=#ba2121]"[/color][color=#555555];[/color]
[color=#b00040]our[/color] [color=#b00040]$main[/color] [color=#555555]=[/color] [color=#ba2121]"http://desk.zol.com.cn"[/color][color=#555555];[/color]
[color=#b00040]our[/color] [color=#b00040]$ua[/color] [color=#555555]=[/color] Mojo[color=#555555]::[/color]UserAgent[color=#555555]->[/color][color=#555555]new[/color][color=#555555]();[/color]
[color=#b00040]our[/color] [color=#b00040]@headers[/color] [color=#555555]= ([/color]
[color=#ba2121]"Host"[/color] [color=#555555]=>[/color] [color=#ba2121]"desk.zol.com.cn"[/color][color=#555555],[/color]
[color=#ba2121]"User-Agent"[/color] [color=#555555]=>[/color] [color=#ba2121]"Firefox/63.0"[/color][color=#555555],[/color]
[color=#555555]);[/color]
mkpath [color=#b00040]$wdir[/color] [color=#008000][b]unless[/b][/color] [color=#555555]-[/color]e [color=#b00040]$wdir[/color][color=#555555];[/color]
[color=#b00040]chdir[/color] [color=#b00040]$wdir[/color][color=#555555];[/color]
[color=#408080][i]# 获取所有主题链接[/i][/color]
[color=#b00040]my[/color] [color=#b00040]@items[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$iter[/color] [color=#555555]=[/color] [color=#666666]1[/color][color=#555555];[/color]
[color=#008000][b]while[/b][/color] [color=#555555]([/color] get_item[color=#555555]([/color] [color=#b00040]$main[/color] [color=#555555].[/color][color=#ba2121]"/${theme}/${iter}.html"[/color][color=#555555],[/color] \[color=#b00040]@items[/color] [color=#555555]) >=[/color] [color=#666666]1[/color] [color=#555555])[/color]
[color=#555555]{[/color]
[color=#b00040]$iter[/color][color=#555555]++;[/color]
[color=#555555]}[/color]
[color=#408080][i]# 遍历页面、提取图片[/i][/color]
[color=#b00040]my[/color] [color=#b00040]$idx[/color] [color=#555555]=[/color] [color=#666666]1[/color][color=#555555];[/color]
[color=#008000][b]for[/b][/color] [color=#b00040]my[/color] [color=#b00040]$item[/color] [color=#555555]([/color] [color=#b00040]@items[/color] [color=#555555])[/color]
[color=#555555]{[/color]
[color=#b00040]printf[/color] [color=#ba2121]"[[/color][color=#ba2121]%03d[/color][color=#ba2121]/[/color][color=#ba2121]%d[/color][color=#ba2121]][/color] [color=#ba2121]%s[/color] [color=#ba2121][/color][color=#ba2121]%s[/color][color=#ba2121][/color][color=#ba2121]\n[/color][color=#ba2121]"[/color][color=#555555],[/color] [color=#b00040]$idx[/color][color=#555555]++ ,[/color] [color=#b00040]$#items[/color][color=#555555]+[/color][color=#666666]1[/color][color=#555555],[/color] [color=#b00040]$item[/color][color=#555555]->{[/color][color=#b00040]link[/color][color=#555555]},[/color] [color=#b00040]$item[/color][color=#555555]->{[/color]title[color=#555555]};[/color]
get_pages[color=#555555]([/color] [color=#b00040]$item[/color][color=#555555]->{[/color][color=#b00040]link[/color][color=#555555]},[/color] [color=#b00040]$item[/color][color=#555555]->{[/color]title[color=#555555]} );[/color]
[color=#555555]}[/color]
[color=#008000][b]sub[/b][/color] get_item
[color=#555555]{[/color]
[color=#b00040]my[/color] [color=#555555]([/color] [color=#b00040]$link[/color][color=#555555],[/color] [color=#b00040]$ref[/color] [color=#555555]) =[/color] [color=#b00040]@_[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$res[/color] [color=#555555]=[/color] try_to_get[color=#555555]([/color] [color=#b00040]$link[/color] [color=#555555]);[/color]
[color=#b00040]my[/color] [color=#b00040]$dom[/color] [color=#555555]=[/color] [color=#b00040]$res[/color][color=#555555]->[/color][color=#555555]dom[/color][color=#555555];[/color]
[color=#008000][b]for[/b][/color] [color=#b00040]my[/color] [color=#b00040]$e[/color] [color=#555555]([/color] [color=#b00040]$dom[/color][color=#555555]->[/color][color=#555555]find[/color][color=#555555]([/color][color=#ba2121]".photo-list-padding"[/color][color=#555555])->[/color][color=#b00040]each[/color] [color=#555555])[/color]
[color=#555555]{[/color]
[color=#b00040]printf[/color] [color=#ba2121]"[/color][color=#ba2121]%s[/color] [color=#ba2121][/color][color=#ba2121]%s[/color][color=#ba2121][/color][color=#ba2121]\n[/color][color=#ba2121]"[/color][color=#555555],[/color] [color=#b00040]$e[/color][color=#555555]->[/color][color=#555555]at[/color][color=#555555]([/color][color=#ba2121]"a"[/color][color=#555555])->[/color][color=#555555]attr[/color][color=#555555]([/color][color=#ba2121]"href"[/color][color=#555555]),[/color] [color=#b00040]$e[/color][color=#555555]->[/color][color=#555555]at[/color][color=#555555]([/color][color=#ba2121]"span"[/color][color=#555555])->[/color][color=#555555]attr[/color][color=#555555]([/color][color=#ba2121]"title"[/color][color=#555555]);[/color]
[color=#b00040]push[/color] @[color=#b00040]$ref[/color][color=#555555], {[/color][color=#ba2121]'link'[/color] [color=#555555]=>[/color] [color=#b00040]$main[/color] [color=#555555].[/color] [color=#b00040]$e[/color][color=#555555]->[/color][color=#555555]at[/color][color=#555555]([/color][color=#ba2121]"a"[/color][color=#555555])->[/color][color=#555555]attr[/color][color=#555555]([/color][color=#ba2121]"href"[/color][color=#555555]),[/color]
[color=#ba2121]'title'[/color] [color=#555555]=>[/color] [color=#b00040]$e[/color][color=#555555]->[/color][color=#555555]at[/color][color=#555555]([/color][color=#ba2121]"span"[/color][color=#555555])->[/color][color=#555555]attr[/color][color=#555555]([/color][color=#ba2121]"title"[/color][color=#555555]) };[/color]
[color=#555555]}[/color]
[color=#408080][i]# 判断是否为最后一页[/i][/color]
[color=#008000][b]if[/b][/color] [color=#555555]([/color] [color=#b00040]defined[/color] [color=#b00040]$dom[/color][color=#555555]->[/color][color=#555555]at[/color][color=#555555]([/color][color=#ba2121]"#pageNext"[/color][color=#555555]) ) {[/color] [color=#008000][b]return[/b][/color] [color=#666666]1[/color] [color=#555555]}[/color]
[color=#008000][b]else[/b][/color] [color=#555555]{[/color] [color=#008000][b]return[/b][/color] [color=#666666]0[/color] [color=#555555]}[/color]
[color=#555555]}[/color]
[color=#408080][i]# --- Get each pages of item --- #[/i][/color]
[color=#008000][b]sub[/b][/color] get_pages
[color=#555555]{[/color]
[color=#b00040]my[/color] [color=#555555]([/color][color=#b00040]$link[/color][color=#555555],[/color] [color=#b00040]$title[/color][color=#555555]) =[/color] [color=#b00040]@_[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$res[/color] [color=#555555]=[/color] try_to_get[color=#555555]([/color] [color=#b00040]$link[/color] [color=#555555]);[/color]
[color=#b00040]my[/color] [color=#b00040]$dom[/color] [color=#555555]=[/color] [color=#b00040]$res[/color][color=#555555]->[/color][color=#555555]dom[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$path[/color] [color=#555555]=[/color] [color=#ba2121]"${wdir}/${title}"[/color][color=#555555];[/color]
mkpath [color=#b00040]$path[/color] [color=#008000][b]unless[/b][/color] [color=#555555]-[/color]e [color=#b00040]$path[/color][color=#555555];[/color]
[color=#b00040]chdir[/color] [color=#b00040]$path[/color][color=#555555];[/color]
[color=#408080][i]# 图片数量[/i][/color]
[color=#b00040]my[/color] [color=#b00040]$pics[/color] [color=#555555]=[/color] [color=#b00040]$dom[/color][color=#555555]->[/color][color=#555555]at[/color][color=#555555]([/color][color=#ba2121]".photo-list-box li i"[/color][color=#555555])->[/color][color=#555555]text[/color][color=#555555];[/color]
[color=#b00040]$pics[/color][color=#555555]=~[/color][color=#555555]s/[^\d]//[/color][color=#555555];[/color] [color=#408080][i]#去除斜杠[/i][/color]
[color=#b00040]my[/color] [color=#b00040]@files[/color] [color=#555555]=[/color] [color=#b00040]glob[/color] [color=#ba2121]"*.jpg"[/color][color=#555555];[/color]
[color=#008000][b]if[/b][/color] [color=#555555]([/color] [color=#b00040]$#files[/color][color=#555555]+[/color][color=#666666]1[/color] [color=#555555]==[/color] [color=#b00040]$pics[/color] [color=#555555]) {[/color]
[color=#b00040]printf[/color] [color=#ba2121]"Images already exist[/color][color=#ba2121]\n[/color][color=#ba2121]"[/color][color=#555555];[/color]
[color=#008000][b]return[/b][/color][color=#555555];[/color]
[color=#555555]}[/color]
[color=#008000][b]for[/b][/color] [color=#b00040]my[/color] [color=#b00040]$e[/color] [color=#555555]([/color][color=#b00040]$dom[/color][color=#555555]->[/color][color=#555555]find[/color][color=#555555]([/color][color=#ba2121]".photo-list-box a"[/color][color=#555555])->[/color][color=#b00040]each[/color] [color=#555555])[/color]
[color=#555555]{[/color]
[color=#408080][i]#printf "%s\n", $e->attr("href");[/i][/color]
get_pic[color=#555555]([/color] [color=#b00040]$main[/color] [color=#555555].[/color] [color=#b00040]$e[/color][color=#555555]->[/color][color=#555555]attr[/color][color=#555555]([/color][color=#ba2121]"href"[/color][color=#555555]) );[/color]
[color=#555555]}[/color]
[color=#555555]}[/color]
[color=#008000][b]sub[/b][/color] get_pic
[color=#555555]{[/color]
[color=#b00040]my[/color] [color=#555555]([/color] [color=#b00040]$link[/color] [color=#555555]) =[/color] [color=#b00040]@_[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$res[/color] [color=#555555]=[/color] try_to_get[color=#555555]([/color] [color=#b00040]$link[/color] [color=#555555]);[/color]
[color=#008000][b]return unless[/b][/color] [color=#555555]([/color][color=#b00040]defined[/color] [color=#b00040]$res[/color][color=#555555]);[/color]
[color=#b00040]my[/color] [color=#b00040]$dom[/color] [color=#555555]=[/color] [color=#b00040]$res[/color][color=#555555]->[/color][color=#555555]dom[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$pic_url[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$pic_name[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$obj[/color] [color=#555555]=[/color] [color=#b00040]$dom[/color][color=#555555]->[/color][color=#555555]at[/color][color=#555555]([/color][color=#ba2121]".wallpaper-down dd a"[/color][color=#555555]);[/color]
[color=#b00040]my[/color] [color=#b00040]$sub_url[/color][color=#555555];[/color]
[color=#008000][b]while[/b][/color] [color=#555555]([/color][color=#666666]1[/color][color=#555555])[/color]
[color=#555555]{[/color]
[color=#b00040]$sub_url[/color] [color=#555555]=[/color] [color=#b00040]$obj[/color][color=#555555]->[/color][color=#555555]attr[/color][color=#555555]([/color][color=#ba2121]"href"[/color][color=#555555]);[/color]
[color=#408080][i]# 某些图片没有提供指定分辨率的链接[/i][/color]
[color=#008000][b]if[/b][/color] [color=#555555]([/color] [color=#b00040]$sub_url[/color] [color=#555555]!~[/color][color=#555555]/\.html/[/color] [color=#555555]) {[/color]
[color=#b00040]printf[/color] [color=#ba2121]"Did not found picture url, skip[/color] [color=#ba2121]%s[/color][color=#ba2121][/color][color=#ba2121]\n[/color][color=#ba2121]"[/color][color=#555555],[/color] [color=#b00040]$sub_url[/color][color=#555555];[/color]
[color=#008000][b]return[/b][/color][color=#555555];[/color]
[color=#555555]}[/color]
[color=#b00040]$pic_name[/color] [color=#555555]=[/color] basename[color=#555555]([/color][color=#b00040]$sub_url[/color][color=#555555]);[/color]
[color=#b00040]$pic_name[/color] [color=#555555]=~[/color] [color=#555555]s/\.html/\.jpg/i[/color][color=#555555];[/color]
[color=#b00040]printf[/color] [color=#ba2121]"[/color][color=#ba2121]%s[/color][color=#ba2121][/color][color=#ba2121]\n[/color][color=#ba2121]"[/color][color=#555555],[/color] [color=#b00040]$pic_name[/color][color=#555555];[/color]
[color=#008000][b]return if[/b][/color] [color=#555555]( -[/color]e [color=#b00040]$pic_name[/color] [color=#555555]);[/color]
[color=#b00040]my[/color] [color=#b00040]$res[/color] [color=#555555]=[/color] try_to_get[color=#555555]([/color] [color=#ba2121]"${main}${sub_url}"[/color] [color=#555555]);[/color]
[color=#008000][b]return unless[/b][/color] [color=#555555]([/color][color=#b00040]defined[/color] [color=#b00040]$res[/color][color=#555555]);[/color]
[color=#b00040]my[/color] [color=#b00040]$dom[/color] [color=#555555]=[/color] [color=#b00040]$res[/color][color=#555555]->[/color][color=#555555]dom[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$res[/color] [color=#555555]=[/color] [color=#b00040]$ua[/color][color=#555555]->[/color][color=#555555]get[/color][color=#555555]([/color] [color=#b00040]$dom[/color][color=#555555]->[/color][color=#555555]at[/color][color=#555555]([/color][color=#ba2121]""[/color][color=#555555])->[/color][color=#555555]attr[/color][color=#555555]([/color][color=#ba2121]"src"[/color][color=#555555]) )->[/color][color=#555555]result[/color][color=#555555];[/color]
[color=#408080][i]# 如果下载失败就选择下一个分辨率的图片[/i][/color]
[color=#008000][b]if[/b][/color] [color=#555555]([/color] [color=#b00040]$res[/color][color=#555555]->[/color][color=#555555]code[/color] [color=#555555]==[/color] [color=#666666]502[/color] [color=#555555]) {[/color] [color=#b00040]$obj[/color] [color=#555555]=[/color] [color=#b00040]$obj[/color][color=#555555]->[/color][color=#008000][b]next[/b][/color][color=#555555];[/color] [color=#008000][b]next[/b][/color][color=#555555]; }[/color]
write_file[color=#555555]([/color] [color=#b00040]$pic_name[/color][color=#555555], {[/color][color=#b00040]binmode[/color][color=#555555]=>[/color][color=#ba2121]":raw"[/color][color=#555555]},[/color] [color=#b00040]$res[/color][color=#555555]->[/color][color=#555555]body[/color] [color=#555555]);[/color]
[color=#008000][b]last[/b][/color][color=#555555];[/color]
[color=#555555]}[/color]
[color=#555555]}[/color]
[color=#008000][b]sub[/b][/color] try_to_get
[color=#555555]{[/color]
[color=#b00040]our[/color] [color=#555555]([/color][color=#b00040]$ua[/color][color=#555555],[/color] [color=#b00040]@headers[/color][color=#555555]);[/color]
[color=#b00040]my[/color] [color=#b00040]$link[/color] [color=#555555]=[/color] [color=#b00040]shift[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$res[/color][color=#555555];[/color]
[color=#b00040]my[/color] [color=#b00040]$retry[/color] [color=#555555]=[/color] [color=#666666]0[/color][color=#555555];[/color]
[color=#008000][b]do[/b][/color]
[color=#555555]{[/color]
[color=#b00040]$res[/color] [color=#555555]=[/color] [color=#b00040]$ua[/color][color=#555555]->[/color][color=#555555]get[/color][color=#555555]([/color] [color=#b00040]$link[/color] [color=#555555])->[/color][color=#555555]result[/color][color=#555555];[/color]
[color=#008000][b]if[/b][/color] [color=#555555]([/color] [color=#b00040]$retry[/color] [color=#555555]>[/color] [color=#666666]0[/color] [color=#b00040]and[/color] [color=#b00040]$retry[/color] [color=#555555]<[/color] [color=#666666]5[/color] [color=#555555]) {[/color] [color=#b00040]print[/color] [color=#ba2121]"Retry times:[/color] [color=#ba2121]$retry[/color][color=#ba2121][/color][color=#ba2121]\n[/color][color=#ba2121]"[/color][color=#555555]; }[/color]
[color=#008000][b]elsif[/b][/color] [color=#555555]([/color] [color=#b00040]$retry[/color] [color=#555555]>[/color] [color=#666666]5[/color] [color=#555555]) {[/color] [color=#b00040]print[/color] [color=#ba2121]"False[/color][color=#ba2121]\n[/color][color=#ba2121]"[/color][color=#555555];[/color] [color=#008000][b]return[/b][/color] [color=#b00040]undef[/color] [color=#555555]}[/color]
[color=#b00040]$retry[/color][color=#555555]++;[/color]
[color=#555555]}[/color]
[color=#008000][b]until[/b][/color] [color=#555555]([/color] [color=#b00040]$res[/color][color=#555555]->[/color][color=#555555]is_success[/color] [color=#555555]);[/color]
[color=#008000][b]return[/b][/color] [color=#b00040]$res[/color][color=#555555];[/color]
[color=#555555]}[/color][/size][/font][/size][/font][/b][/td][/tr][/table][/list][/list]
页:
[1]