标题: [原创代码] [福利]多线程爬取某网站图片 [打印本页]
作者: 523066680 时间: 2018-1-18 19:42 标题: [福利]多线程爬取某网站图片
本帖最后由 523066680 于 2018-1-19 07:37 编辑
推荐环境: Strawberry Perl- =info
- Author: 523066680/vicyang
- Date: 2018-01-16
- =cut
-
- use Modern::Perl;
- use threads;
- use threads::shared;
- use File::Slurp;
- use File::Path qw/make_path/;
- use File::Basename;
- use Mojo::UserAgent;
- use Mojo::DOM;
- use Try::Tiny;
- use Time::HiRes qw/sleep time/;
- use Term::ReadKey;
- use IO::Handle;
- STDOUT->autoflush(1);
-
- our $main = "http://www.elitebabes.com/model/katherine-a";
- our $workdir = "D:/Hex/w4b_models/". basename($main);
- make_path $workdir unless ( -e $workdir );
- chdir $workdir;
- mkdir "links" unless ( -e "links" );
-
- our $progress :shared;
- our $total :shared;
- our @ths;
- our @mission :shared; #共享到线程
- our %headers = (
- 'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
- 'Referer' => 'http://www.elitebabes.com',
- 'Connection' => 'keep-alive',
- );
-
- say "Step1";
- get_models_data($main);
-
- say "Step2";
- @mission = glob "'$workdir/links/*.txt'";
-
- $progress = 0;
- $total = scalar( @mission );
-
- #创建线程
- grep { push @ths, threads->create( \&thread_func, $_ ) } ( 0 .. 3 );
-
- #等待运行结束
- while ( threads->list(threads::running) ) { sleep 0.2 };
-
- #线程分离/结束
- grep { $_->detach() } threads->list(threads::all);
-
- quit();
-
- sub get_models_data
- {
- my ( $page ) = @_;
- my ($title, $subpg, $count);
-
- my $ua = Mojo::UserAgent->new();
- my $res;
-
- $res = $ua->get( $page, \%headers )->result;
- my $dom = $res->dom;
- get_info( $page, $dom, "info.txt" );
-
- $count = 0;
- for my $e ( $dom->find("ul.gallery-a a")->each )
- {
- $count++;
- $subpg = $e->attr("href");
- $title = $e->attr("title");
- $title = basename( $subpg ) if ( $title eq "" );
- $title =~s/\s+$//; #去掉可能出现的末尾空格
-
- get_piclinks_of_subpage( $ua, $title, $subpg, $count );
- }
- }
-
- sub get_info
- {
- my ( $page, $dom, $file ) = @_;
- my @data = ($page);
- my ($like, $unlike);
-
- return if ( -e $file );
-
- for my $e ( $dom->at("ul.list-a")->find('li')->each ) {
- push @data, $e->at('span')->text . $e->text ;
- }
-
- $like = $dom->at("span#thelike")->text;
- $unlike = $dom->at("span#thedown")->text;
- push @data, "like: $like";
- push @data, "unlike: $unlike";
- write_file( $file, join("\n", @data) );
- }
-
- sub get_piclinks_of_subpage
- {
- my ($ua, $title, $subpage, $count) = @_;
-
- my ( $res, $dom, $href );
- my ($times);
- my @links;
- my $file = "./links/${title}.txt";
-
- if ( -e $file )
- {
- printf "%03d - %s file already exists\n", $count, $title;
- return;
- }
-
- $times = 0;
- while (1)
- {
- try { $res = $ua->get($subpage)->result }
- catch { printf "getting subpage, retry: %d\n", $times++; };
- last if ( defined $res and $res->is_success );
- return if ( $times > 10 );
- }
-
- $dom = $res->dom;
- # find pics
- for my $e ( $dom->find(".gallery-b a")->each ) {
- push @links, $e->attr("href");
- }
-
- # find video
- for my $e ( $dom->find("video,.my_video*")->each ) {
- push @links, $e->at("source")->attr("src");
- }
-
- if ( $#links < 0 ) { printf "fail to get media\n" }
- else
- {
- printf "%03d - %s\n", $count, $title;
- write_file( $file, join("\n", @links) );
- }
- }
-
- sub thread_func
- {
- our (@mission, @headers, $progress, $total);
- my $idx = shift;
- my $time_a;
- my $target;
- my $subfold;
- my @links;
- my $ua = Mojo::UserAgent->new();
- $ua = $ua->max_redirects(5);
-
- $SIG{'BREAK'} = sub { threads->exit() };
-
- while ( $#mission >= 0 )
- {
- $progress++;
- $target = shift @mission;
- @links = read_file( $target );
-
- # 获取文件名作为目录名
- ($subfold, undef, undef) = fileparse($target, qr/\.[^.]*$/);
- printf "[%d] [%03d/%03d] %s\n", threads->tid(), $progress, $total, $subfold;
- mkdir $subfold unless -e $subfold;
-
- get_pics( threads->tid(), $ua, $subfold, \@links );
- }
- }
-
- sub get_pics
- {
- our %headers;
- my ($id, $ua, $fold, $links) = @_;
- my $res;
- my $filepath;
- my $times;
-
- for my $e ( @$links )
- {
- $e=~s/\r?\n//;
- next if ( $e !~ /(jpg|png|bmp|gif)/i );
- $filepath = $fold ."/". basename($e);
- if ( -e $filepath and ( check_jpg_file_tail( $filepath ) == 1 ) )
- {
- #printf " [%d] %s file exists\n", $id, $filepath;
- next;
- }
-
- printf " [%d] %s\n", $id, $filepath;
- $times = 0;
- while (1)
- {
- try { $res = $ua->get($e, \%headers)->result; }
- catch { printf "getting pics, retry: %d\n", $times++; };
- last if ( defined $res and $res->is_success );
- return if ( $times > 10 );
- }
-
- $res->content->asset->move_to( $filepath );
- }
- }
-
- sub check_jpg_file_tail
- {
- my $file = shift;
- my ($fh, $buff);
- open $fh, "<:raw", $file or warn "$!";
- seek($fh, -2, 2);
- read($fh, $buff, 2);
- if ( $buff eq "\xFF\xD9" ) { return 1 }
- else { return 0 }
- }
-
- sub quit
- {
- print "Press Any Key to Continue ...";
- ReadKey -1;
- }
复制代码
欢迎光临 批处理之家 (http://bbs.bathome.net/) |
Powered by Discuz! 7.2 |