[福利]多线程爬取某网站图片
[i=s] 本帖最后由 523066680 于 2018-1-19 07:37 编辑 [/i]推荐环境: Strawberry Perl[code]=info
Author: 523066680/vicyang
Date: 2018-01-16
=cut
use Modern::Perl;
use threads;
use threads::shared;
use File::Slurp;
use File::Path qw/make_path/;
use File::Basename;
use Mojo::UserAgent;
use Mojo::DOM;
use Try::Tiny;
use Time::HiRes qw/sleep time/;
use Term::ReadKey;
use IO::Handle;
STDOUT->autoflush(1);
our $main = "http://www.elitebabes.com/model/katherine-a";
our $workdir = "D:/Hex/w4b_models/". basename($main);
make_path $workdir unless ( -e $workdir );
chdir $workdir;
mkdir "links" unless ( -e "links" );
our $progress :shared;
our $total :shared;
our @ths;
our @mission :shared; #共享到线程
our %headers = (
'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Referer' => 'http://www.elitebabes.com',
'Connection' => 'keep-alive',
);
say "Step1";
get_models_data($main);
say "Step2";
@mission = glob "'$workdir/links/*.txt'";
$progress = 0;
$total = scalar( @mission );
#创建线程
grep { push @ths, threads->create( \&thread_func, $_ ) } ( 0 .. 3 );
#等待运行结束
while ( threads->list(threads::running) ) { sleep 0.2 };
#线程分离/结束
grep { $_->detach() } threads->list(threads::all);
quit();
sub get_models_data
{
my ( $page ) = @_;
my ($title, $subpg, $count);
my $ua = Mojo::UserAgent->new();
my $res;
$res = $ua->get( $page, \%headers )->result;
my $dom = $res->dom;
get_info( $page, $dom, "info.txt" );
$count = 0;
for my $e ( $dom->find("ul.gallery-a a")->each )
{
$count++;
$subpg = $e->attr("href");
$title = $e->attr("title");
$title = basename( $subpg ) if ( $title eq "" );
$title =~s/\s+$//; #去掉可能出现的末尾空格
get_piclinks_of_subpage( $ua, $title, $subpg, $count );
}
}
sub get_info
{
my ( $page, $dom, $file ) = @_;
my @data = ($page);
my ($like, $unlike);
return if ( -e $file );
for my $e ( $dom->at("ul.list-a")->find('li')->each ) {
push @data, $e->at('span')->text . $e->text ;
}
$like = $dom->at("span#thelike")->text;
$unlike = $dom->at("span#thedown")->text;
push @data, "like: $like";
push @data, "unlike: $unlike";
write_file( $file, join("\n", @data) );
}
sub get_piclinks_of_subpage
{
my ($ua, $title, $subpage, $count) = @_;
my ( $res, $dom, $href );
my ($times);
my @links;
my $file = "./links/${title}.txt";
if ( -e $file )
{
printf "%03d - %s file already exists\n", $count, $title;
return;
}
$times = 0;
while (1)
{
try { $res = $ua->get($subpage)->result }
catch { printf "getting subpage, retry: %d\n", $times++; };
last if ( defined $res and $res->is_success );
return if ( $times > 10 );
}
$dom = $res->dom;
# find pics
for my $e ( $dom->find(".gallery-b a")->each ) {
push @links, $e->attr("href");
}
# find video
for my $e ( $dom->find("video,.my_video*")->each ) {
push @links, $e->at("source")->attr("src");
}
if ( $#links < 0 ) { printf "fail to get media\n" }
else
{
printf "%03d - %s\n", $count, $title;
write_file( $file, join("\n", @links) );
}
}
sub thread_func
{
our (@mission, @headers, $progress, $total);
my $idx = shift;
my $time_a;
my $target;
my $subfold;
my @links;
my $ua = Mojo::UserAgent->new();
$ua = $ua->max_redirects(5);
$SIG{'BREAK'} = sub { threads->exit() };
while ( $#mission >= 0 )
{
$progress++;
$target = shift @mission;
@links = read_file( $target );
# 获取文件名作为目录名
($subfold, undef, undef) = fileparse($target, qr/\.[^.]*$/);
printf "[%d] [%03d/%03d] %s\n", threads->tid(), $progress, $total, $subfold;
mkdir $subfold unless -e $subfold;
get_pics( threads->tid(), $ua, $subfold, \@links );
}
}
sub get_pics
{
our %headers;
my ($id, $ua, $fold, $links) = @_;
my $res;
my $filepath;
my $times;
for my $e ( @$links )
{
$e=~s/\r?\n//;
next if ( $e !~ /(jpg|png|bmp|gif)/i );
$filepath = $fold ."/". basename($e);
if ( -e $filepath and ( check_jpg_file_tail( $filepath ) == 1 ) )
{
#printf " [%d] %s file exists\n", $id, $filepath;
next;
}
printf " [%d] %s\n", $id, $filepath;
$times = 0;
while (1)
{
try { $res = $ua->get($e, \%headers)->result; }
catch { printf "getting pics, retry: %d\n", $times++; };
last if ( defined $res and $res->is_success );
return if ( $times > 10 );
}
$res->content->asset->move_to( $filepath );
}
}
sub check_jpg_file_tail
{
my $file = shift;
my ($fh, $buff);
open $fh, "<:raw", $file or warn "$!";
seek($fh, -2, 2);
read($fh, $buff, 2);
if ( $buff eq "\xFF\xD9" ) { return 1 }
else { return 0 }
}
sub quit
{
print "Press Any Key to Continue ...";
ReadKey -1;
}[/code]
页:
[1]