[福利]多线程爬取某网站图片(页 1) - Perl - 批处理之家 BAT,CMD,批处理,PowerShell,VBS,DOS

523066680 发表于 2018-1-18 19:42

[福利]多线程爬取某网站图片

[i=s] 本帖最后由 523066680 于 2018-1-19 07:37 编辑 [/i]

推荐环境： Strawberry Perl[code]=info
Author: 523066680/vicyang
Date: 2018-01-16
=cut

use Modern::Perl;
use threads;
use threads::shared;
use File::Slurp;
use File::Path qw/make_path/;
use File::Basename;
use Mojo::UserAgent;
use Mojo::DOM;
use Try::Tiny;
use Time::HiRes qw/sleep time/;
use Term::ReadKey;
use IO::Handle;
STDOUT->autoflush(1);

our $main = "http://www.elitebabes.com/model/katherine-a";
our $workdir = "D:/Hex/w4b_models/". basename($main);
make_path $workdir unless ( -e $workdir );
chdir $workdir;
mkdir "links" unless ( -e "links" );

our $progress :shared;
our $total :shared;
our @ths;
our @mission :shared; #共享到线程
our %headers = (
'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Referer' => 'http://www.elitebabes.com',
'Connection' => 'keep-alive',
);

say "Step1";
get_models_data($main);

say "Step2";
@mission = glob "'$workdir/links/*.txt'";

$progress = 0;
$total = scalar( @mission );

#创建线程
grep { push @ths, threads->create( \&thread_func, $_ ) } ( 0 .. 3 );

#等待运行结束
while ( threads->list(threads::running) ) { sleep 0.2 };

#线程分离/结束
grep { $_->detach() } threads->list(threads::all);

quit();

sub get_models_data
{
my ( $page ) = @_;
my ($title, $subpg, $count);

my $ua = Mojo::UserAgent->new();
my $res;

$res = $ua->get( $page, \%headers )->result;
my $dom = $res->dom;
get_info( $page, $dom, "info.txt" );

$count = 0;
for my $e ( $dom->find("ul.gallery-a a")->each )
{
      $count++;
      $subpg = $e->attr("href");
      $title = $e->attr("title");
      $title = basename( $subpg ) if ( $title eq "" );
      $title =~s/\s+$//; #去掉可能出现的末尾空格

      get_piclinks_of_subpage( $ua, $title, $subpg, $count );
}
}

sub get_info
{
my ( $page, $dom, $file ) = @_;
my @data = ($page);
my ($like, $unlike);

return if ( -e $file );

for my $e ( $dom->at("ul.list-a")->find('li')->each  ) {
      push @data, $e->at('span')->text . $e->text ;
}

$like = $dom->at("span#thelike")->text;
$unlike = $dom->at("span#thedown")->text;
push @data, "like: $like";
push @data, "unlike: $unlike";
write_file( $file, join("\n", @data) );
}

sub get_piclinks_of_subpage
{
my ($ua, $title, $subpage, $count) = @_;

my ( $res, $dom, $href );
my ($times);
my @links;
my $file = "./links/${title}.txt";

if ( -e $file )
{
      printf "%03d - %s file already exists\n", $count, $title;
      return;
}

$times = 0;
while (1)
{
      try { $res = $ua->get($subpage)->result  }
      catch { printf "getting subpage, retry: %d\n", $times++; };
      last if ( defined $res and $res->is_success );
      return if ( $times > 10 );
}

$dom = $res->dom;
# find pics
for my $e ( $dom->find(".gallery-b a")->each ) {
      push @links, $e->attr("href");
}

# find video
for my $e ( $dom->find("video,.my_video*")->each ) {
      push @links, $e->at("source")->attr("src");
}

if ( $#links < 0 ) { printf "fail to get media\n" }
else
{
      printf "%03d - %s\n", $count, $title;
      write_file( $file, join("\n", @links) );
}
}

sub thread_func
{
our (@mission, @headers, $progress, $total);
my $idx = shift;
my $time_a;
my $target;
my $subfold;
my @links;
my $ua = Mojo::UserAgent->new();
$ua = $ua->max_redirects(5);

$SIG{'BREAK'} = sub { threads->exit() };

while ( $#mission >= 0 )
{
      $progress++;
      $target = shift @mission;
      @links = read_file( $target );

      # 获取文件名作为目录名
      ($subfold, undef, undef)  = fileparse($target, qr/\.[^.]*$/);
      printf "[%d] [%03d/%03d] %s\n", threads->tid(), $progress, $total, $subfold;
      mkdir $subfold unless -e $subfold;

      get_pics( threads->tid(), $ua, $subfold, \@links );
}
}

sub get_pics
{
our %headers;
my ($id, $ua, $fold, $links) = @_;
my $res;
my $filepath;
my $times;

for my $e ( @$links )
{
      $e=~s/\r?\n//;
      next if ( $e !~ /(jpg|png|bmp|gif)/i );
      $filepath = $fold ."/". basename($e);
      if ( -e $filepath and ( check_jpg_file_tail( $filepath ) == 1 ) )
      {
         #printf " [%d] %s file exists\n", $id, $filepath;
         next;
      }

      printf " [%d] %s\n", $id, $filepath;
      $times = 0;
      while (1)
      {
         try { $res = $ua->get($e, \%headers)->result;  }
         catch { printf "getting pics, retry: %d\n", $times++; };
         last if ( defined $res and $res->is_success );
         return if ( $times > 10 );
      }

      $res->content->asset->move_to( $filepath );
}
}

sub check_jpg_file_tail
{
my $file = shift;
my ($fh, $buff);
open $fh, "<:raw", $file or warn "$!";
seek($fh, -2, 2);
read($fh, $buff, 2);
if ( $buff eq "\xFF\xD9" ) { return 1 }
else { return 0 }
}

sub quit
{
print "Press Any Key to Continue ...";
ReadKey -1;
}[/code]

页: [1]

批处理之家's Archiver

[福利]多线程爬取某网站图片