Board logo

标题: [原创代码] [福利]多线程爬取某网站图片 [打印本页]

作者: 523066680    时间: 2018-1-18 19:42     标题: [福利]多线程爬取某网站图片

本帖最后由 523066680 于 2018-1-19 07:37 编辑

推荐环境: Strawberry Perl
  1. =info
  2.     Author: 523066680/vicyang
  3.     Date: 2018-01-16
  4. =cut
  5. use Modern::Perl;
  6. use threads;
  7. use threads::shared;
  8. use File::Slurp;
  9. use File::Path qw/make_path/;
  10. use File::Basename;
  11. use Mojo::UserAgent;
  12. use Mojo::DOM;
  13. use Try::Tiny;
  14. use Time::HiRes qw/sleep time/;
  15. use Term::ReadKey;
  16. use IO::Handle;
  17. STDOUT->autoflush(1);
  18. our $main = "http://www.elitebabes.com/model/katherine-a";
  19. our $workdir = "D:/Hex/w4b_models/". basename($main);
  20. make_path $workdir unless ( -e $workdir );
  21. chdir $workdir;
  22. mkdir "links" unless ( -e "links" );
  23. our $progress :shared;
  24. our $total    :shared;
  25. our @ths;
  26. our @mission :shared; #共享到线程
  27. our %headers = (
  28.     'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
  29.     'Referer' => 'http://www.elitebabes.com',
  30.     'Connection' => 'keep-alive',
  31.     );
  32. say "Step1";
  33. get_models_data($main);
  34. say "Step2";
  35. @mission = glob "'$workdir/links/*.txt'";
  36. $progress = 0;
  37. $total = scalar( @mission );
  38. #创建线程
  39. grep { push @ths, threads->create( \&thread_func, $_ ) } ( 0 .. 3 );
  40. #等待运行结束
  41. while ( threads->list(threads::running) ) { sleep 0.2 };
  42. #线程分离/结束
  43. grep { $_->detach() } threads->list(threads::all);
  44. quit();
  45. sub get_models_data
  46. {
  47.     my ( $page ) = @_;
  48.     my ($title, $subpg, $count);
  49.     my $ua = Mojo::UserAgent->new();
  50.     my $res;
  51.     $res = $ua->get( $page, \%headers )->result;
  52.     my $dom = $res->dom;
  53.     get_info( $page, $dom, "info.txt" );
  54.     $count = 0;
  55.     for my $e ( $dom->find("ul.gallery-a a")->each )
  56.     {
  57.         $count++;
  58.         $subpg = $e->attr("href");
  59.         $title = $e->attr("title");
  60.         $title = basename( $subpg ) if ( $title eq "" );
  61.         $title =~s/\s+$//; #去掉可能出现的末尾空格
  62.         get_piclinks_of_subpage( $ua, $title, $subpg, $count );
  63.     }
  64. }
  65. sub get_info
  66. {
  67.     my ( $page, $dom, $file ) = @_;
  68.     my @data = ($page);
  69.     my ($like, $unlike);
  70.     return if ( -e $file );
  71.     for my $e ( $dom->at("ul.list-a")->find('li')->each  ) {
  72.         push @data, $e->at('span')->text . $e->text ;
  73.     }
  74.     $like = $dom->at("span#thelike")->text;
  75.     $unlike = $dom->at("span#thedown")->text;
  76.     push @data, "like: $like";
  77.     push @data, "unlike: $unlike";
  78.     write_file( $file, join("\n", @data) );
  79. }
  80. sub get_piclinks_of_subpage
  81. {
  82.     my ($ua, $title, $subpage, $count) = @_;
  83.     my ( $res, $dom, $href );
  84.     my ($times);
  85.     my @links;
  86.     my $file = "./links/${title}.txt";
  87.     if ( -e $file )
  88.     {
  89.         printf "%03d - %s file already exists\n", $count, $title;
  90.         return;
  91.     }
  92.     $times = 0;
  93.     while (1)
  94.     {
  95.         try { $res = $ua->get($subpage)->result  }
  96.         catch { printf "getting subpage, retry: %d\n", $times++; };
  97.         last if ( defined $res and $res->is_success );
  98.         return if ( $times > 10 );
  99.     }
  100.     $dom = $res->dom;
  101.     # find pics
  102.     for my $e ( $dom->find(".gallery-b a")->each ) {
  103.         push @links, $e->attr("href");
  104.     }
  105.     # find video
  106.     for my $e ( $dom->find("video,.my_video*")->each ) {
  107.         push @links, $e->at("source")->attr("src");
  108.     }
  109.     if ( $#links < 0 ) { printf "fail to get media\n" }
  110.     else
  111.     {
  112.         printf "%03d - %s\n", $count, $title;
  113.         write_file( $file, join("\n", @links) );
  114.     }
  115. }
  116. sub thread_func
  117. {
  118.     our (@mission, @headers, $progress, $total);
  119.     my $idx = shift;
  120.     my $time_a;
  121.     my $target;
  122.     my $subfold;
  123.     my @links;
  124.     my $ua = Mojo::UserAgent->new();
  125.     $ua = $ua->max_redirects(5);
  126.     $SIG{'BREAK'} = sub { threads->exit() };
  127.     while ( $#mission >= 0 )
  128.     {
  129.         $progress++;
  130.         $target = shift @mission;
  131.         @links = read_file( $target );
  132.         # 获取文件名作为目录名
  133.         ($subfold, undef, undef)  = fileparse($target, qr/\.[^.]*$/);
  134.         printf "[%d] [%03d/%03d] %s\n", threads->tid(), $progress, $total, $subfold;
  135.         mkdir $subfold unless -e $subfold;
  136.         get_pics( threads->tid(), $ua, $subfold, \@links );
  137.     }
  138. }
  139. sub get_pics
  140. {
  141.     our %headers;
  142.     my ($id, $ua, $fold, $links) = @_;
  143.     my $res;
  144.     my $filepath;
  145.     my $times;
  146.     for my $e ( @$links )
  147.     {
  148.         $e=~s/\r?\n//;
  149.         next if ( $e !~ /(jpg|png|bmp|gif)/i );
  150.         $filepath = $fold ."/". basename($e);
  151.         if ( -e $filepath and ( check_jpg_file_tail( $filepath ) == 1 ) )
  152.         {
  153.             #printf "    [%d] %s file exists\n", $id, $filepath;
  154.             next;
  155.         }
  156.         
  157.         printf "    [%d] %s\n", $id, $filepath;
  158.         $times = 0;
  159.         while (1)
  160.         {
  161.             try { $res = $ua->get($e, \%headers)->result;  }
  162.             catch { printf "getting pics, retry: %d\n", $times++; };
  163.             last if ( defined $res and $res->is_success );
  164.             return if ( $times > 10 );
  165.         }
  166.         $res->content->asset->move_to( $filepath );
  167.     }
  168. }
  169. sub check_jpg_file_tail
  170. {
  171.     my $file = shift;
  172.     my ($fh, $buff);
  173.     open $fh, "<:raw", $file or warn "$!";
  174.     seek($fh, -2, 2);
  175.     read($fh, $buff, 2);
  176.     if ( $buff eq "\xFF\xD9" ) { return 1 }
  177.     else { return 0 }
  178. }
  179. sub quit
  180. {
  181.     print "Press Any Key to Continue ...";
  182.     ReadKey -1;
  183. }
复制代码





欢迎光临 批处理之家 (http://bbs.bathome.net/) Powered by Discuz! 7.2