批处理之家's Archiver

523066680 发表于 2018-1-18 19:42

[福利]多线程爬取某网站图片

[i=s] 本帖最后由 523066680 于 2018-1-19 07:37 编辑 [/i]

推荐环境: Strawberry Perl[code]=info
    Author: 523066680/vicyang
    Date: 2018-01-16
=cut

use Modern::Perl;
use threads;
use threads::shared;
use File::Slurp;
use File::Path qw/make_path/;
use File::Basename;
use Mojo::UserAgent;
use Mojo::DOM;
use Try::Tiny;
use Time::HiRes qw/sleep time/;
use Term::ReadKey;
use IO::Handle;
STDOUT->autoflush(1);

our $main = "http://www.elitebabes.com/model/katherine-a";
our $workdir = "D:/Hex/w4b_models/". basename($main);
make_path $workdir unless ( -e $workdir );
chdir $workdir;
mkdir "links" unless ( -e "links" );

our $progress :shared;
our $total    :shared;
our @ths;
our @mission :shared; #共享到线程
our %headers = (
    'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Referer' => 'http://www.elitebabes.com',
    'Connection' => 'keep-alive',
    );

say "Step1";
get_models_data($main);

say "Step2";
@mission = glob "'$workdir/links/*.txt'";

$progress = 0;
$total = scalar( @mission );

#创建线程
grep { push @ths, threads->create( \&thread_func, $_ ) } ( 0 .. 3 );

#等待运行结束
while ( threads->list(threads::running) ) { sleep 0.2 };

#线程分离/结束
grep { $_->detach() } threads->list(threads::all);

quit();

sub get_models_data
{
    my ( $page ) = @_;
    my ($title, $subpg, $count);

    my $ua = Mojo::UserAgent->new();
    my $res;

    $res = $ua->get( $page, \%headers )->result;
    my $dom = $res->dom;
    get_info( $page, $dom, "info.txt" );

    $count = 0;
    for my $e ( $dom->find("ul.gallery-a a")->each )
    {
        $count++;
        $subpg = $e->attr("href");
        $title = $e->attr("title");
        $title = basename( $subpg ) if ( $title eq "" );
        $title =~s/\s+$//; #去掉可能出现的末尾空格

        get_piclinks_of_subpage( $ua, $title, $subpg, $count );
    }
}

sub get_info
{
    my ( $page, $dom, $file ) = @_;
    my @data = ($page);
    my ($like, $unlike);

    return if ( -e $file );

    for my $e ( $dom->at("ul.list-a")->find('li')->each  ) {
        push @data, $e->at('span')->text . $e->text ;
    }

    $like = $dom->at("span#thelike")->text;
    $unlike = $dom->at("span#thedown")->text;
    push @data, "like: $like";
    push @data, "unlike: $unlike";
    write_file( $file, join("\n", @data) );
}

sub get_piclinks_of_subpage
{
    my ($ua, $title, $subpage, $count) = @_;

    my ( $res, $dom, $href );
    my ($times);
    my @links;
    my $file = "./links/${title}.txt";

    if ( -e $file )
    {
        printf "%03d - %s file already exists\n", $count, $title;
        return;
    }

    $times = 0;
    while (1)
    {
        try { $res = $ua->get($subpage)->result  }
        catch { printf "getting subpage, retry: %d\n", $times++; };
        last if ( defined $res and $res->is_success );
        return if ( $times > 10 );
    }

    $dom = $res->dom;
    # find pics
    for my $e ( $dom->find(".gallery-b a")->each ) {
        push @links, $e->attr("href");
    }

    # find video
    for my $e ( $dom->find("video,.my_video*")->each ) {
        push @links, $e->at("source")->attr("src");
    }

    if ( $#links < 0 ) { printf "fail to get media\n" }
    else
    {
        printf "%03d - %s\n", $count, $title;
        write_file( $file, join("\n", @links) );
    }
}

sub thread_func
{
    our (@mission, @headers, $progress, $total);
    my $idx = shift;
    my $time_a;
    my $target;
    my $subfold;
    my @links;
    my $ua = Mojo::UserAgent->new();
    $ua = $ua->max_redirects(5);

    $SIG{'BREAK'} = sub { threads->exit() };

    while ( $#mission >= 0 )
    {
        $progress++;
        $target = shift @mission;
        @links = read_file( $target );

        # 获取文件名作为目录名
        ($subfold, undef, undef)  = fileparse($target, qr/\.[^.]*$/);
        printf "[%d] [%03d/%03d] %s\n", threads->tid(), $progress, $total, $subfold;
        mkdir $subfold unless -e $subfold;

        get_pics( threads->tid(), $ua, $subfold, \@links );
    }
}

sub get_pics
{
    our %headers;
    my ($id, $ua, $fold, $links) = @_;
    my $res;
    my $filepath;
    my $times;

    for my $e ( @$links )
    {
        $e=~s/\r?\n//;
        next if ( $e !~ /(jpg|png|bmp|gif)/i );
        $filepath = $fold ."/". basename($e);
        if ( -e $filepath and ( check_jpg_file_tail( $filepath ) == 1 ) )
        {
            #printf "    [%d] %s file exists\n", $id, $filepath;
            next;
        }
        
        printf "    [%d] %s\n", $id, $filepath;
        $times = 0;
        while (1)
        {
            try { $res = $ua->get($e, \%headers)->result;  }
            catch { printf "getting pics, retry: %d\n", $times++; };
            last if ( defined $res and $res->is_success );
            return if ( $times > 10 );
        }

        $res->content->asset->move_to( $filepath );
    }
}

sub check_jpg_file_tail
{
    my $file = shift;
    my ($fh, $buff);
    open $fh, "<:raw", $file or warn "$!";
    seek($fh, -2, 2);
    read($fh, $buff, 2);
    if ( $buff eq "\xFF\xD9" ) { return 1 }
    else { return 0 }
}

sub quit
{
    print "Press Any Key to Continue ...";
    ReadKey -1;
}[/code]

页: [1]

Powered by Discuz! Archiver 7.2  © 2001-2009 Comsenz Inc.