[Perl]抓取句子大全的网页
[i=s] 本帖最后由 523066680 于 2018-10-4 16:32 编辑 [/i][b][Perl]抓取句子大全的网页[/b]
分两步,第一步先提取网页,然后在本地提取文本。
可以中途终止脚本,重新打开后会略过已经完成的条目。
抓取的 HTML 保存在 D:/temp/句子大全 文件夹[code]=info
523066680/vicyang
2018-10
=cut
use utf8;
use Encode;
use File::Path;
use File::Slurp;
use LWP::UserAgent;
use File::Basename qw/basename/;
use Mojo::DOM;
STDOUT->autoflush(1);

# Working directory for the downloaded HTML (GBK-encoded path for Windows).
our $wdir = encode('gbk', "D:/temp/句子大全");
mkpath $wdir unless -e $wdir;

our $main = "http://www.1juzi.com";
# FIX: "timemout" was a typo — the option was silently ignored, so the
# intended 8-second timeout never applied (LWP's default is 180s).
our $ua = LWP::UserAgent->new( keep_alive => 1, timeout => 8 );

# Fetch the front page and collect category links + target folders.
my $res  = $ua->get($main);
my $html = $res->content();
my $dom  = Mojo::DOM->new($html);

my (@urls, @dirs);
get_item($dom, \@urls, \@dirs);

my $tdir;
for my $id ( 0 .. $#urls )
{
    printf "%s\n", $dirs[$id];
    next if -e $dirs[$id];          # Skip categories already completed by a previous run
    # Download into a "_"-suffixed folder first and rename it when done,
    # so an interrupted run is never mistaken for a finished category.
    $tdir = $dirs[$id] ."_";
    mkpath $tdir unless -e $tdir;
    get_alist( $main .$urls[$id], $tdir );
    rename( $tdir , $dirs[$id] );   # Restore final name
}
sub get_item
{
    # Collect category links from the site's header menu.
    # Fills @$urls with each anchor's href and @$dirs with the matching
    # "<wdir>/<group>/<category>" destination folder path.
    our $wdir;
    my ($dom, $urls, $dirs) = @_;
    my $menu = $dom->at(".header-menu");
    for my $anchor ( $menu->find("ul li a")->each )
    {
        push @$urls, $anchor->attr("href");
        # Group name comes from the element just before this anchor's <ul>.
        my $group = $anchor->parent->parent->previous->text;
        push @$dirs, join "/", $wdir, $group, $anchor->text;
    }
}
sub get_alist
{
    # Download every article linked from the ".alist" index page at $url
    # into $dir; each file is named after the last path component of its link.
    our ($main, $ua);    # FIX: declare $ua too — it was used as an implicit global
    my ($url, $dir) = @_;
    my $res = $ua->get( $url );
    my $dom = Mojo::DOM->new( $res->content );
    my @links = @{ $dom->at(".alist")->find("a")->map(attr=>"href") };
    #get_page
    my $retry;
    for my $link ( @links )
    {
        printf " %s\n", $link;
        $retry = 0;
        do
        {
            $res = $ua->get( $main .$link );
            $retry++;
            print "retry times: $retry\n" if ($retry > 1 );
        }
        # FIX: cap retries at 3 (matching the threaded fetcher) — the
        # original looped forever on a permanently failing link.
        until ( $res->is_success() or $retry > 3 );
        # Only write the page when the request actually succeeded.
        write_file( $dir ."/". basename($link), $res->content ) if $res->is_success;
    }
}
[/code]
第二段代码,从本地的HTML提取正文,汇总到TXT
[i=s] 本帖最后由 523066680 于 2018-10-4 21:59 编辑 [/i]话说这个网站是不是被攻击了?
文章夹杂各种 <script> 例如[code]
<script>Baidu_A_D("b1");</script>
<script type="text/javascript">news1();</script>[/code]做了remove处理[code]=info
523066680/vicyang
2018-10
=cut
use utf8;
use Encode;
use File::Slurp;
use Mojo::DOM;
STDOUT->autoflush(1);

our $wdir = encode('gbk', "D:/Temp/句子大全");
chdir $wdir or warn "$!";

my $buff;
my @files;
# Enumerate every category sub-directory via Windows' dir (/ad = dirs only,
# /s = recursive, /b = bare paths).
my @dirs = `dir "$wdir" /ad /s /b`;
# FIX: strip line endings with a plain for-loop — the original used grep
# in void context purely for its side effect.
s/\r?\n// for @dirs;

for my $dir ( @dirs )
{
    printf "%s\n", $dir;
    chdir $dir or die "$!";
    @files = glob "*.html";
    next unless @files;
    $buff = "";
    # Concatenate articles in DESCENDING numeric order of the filename stem
    # ("123.html" -> 123), matching the original ordering.
    for my $file ( sort { substr($b, 0, -5) <=> substr($a, 0, -5) } @files )
    {
        $buff .= article( $file );
    }
    write_file( "${dir}.txt", $buff );
}
sub article
{
    # Extract the title and body text of one saved HTML page and return
    # them as a GBK-encoded text chunk: title line, filename line, body.
    my $file = shift;
    my $html = decode('gbk', scalar(read_file( $file )) );
    # Strip a space-like character the site injects between words
    # (NOTE(review): confirm which codepoint — see thread note about
    # English-word spacing).
    $html =~s/ //g;
    # FIX: lexical $dom — the original leaked an undeclared package global.
    my $dom = Mojo::DOM->new( $html );
    # Remove ad/navigation clutter: <script>, <u>, and the prev/next pager.
    $_->remove for $dom->at(".content")->find("script")->each;
    $_->remove for $dom->at(".content")->find("u")->each;
    $dom->at(".page")->remove;
    my $title = $dom->at("h1")->all_text;
    my $text  = $dom->at(".content")->all_text;
    $text =~s/(\d+、)/\n$1/g;    # put each numbered sentence on its own line
    $text =~s/\Q$title\E//;      # the body repeats the title — drop it
    $text =~s/[\r\n]+/\n/g;
    $text =~s/^\n//;
    my $str;
    $str  = sprintf "%s\n", encode('gbk', $title );
    $str .= sprintf "%s\n", $file;
    $str .= sprintf "%s\n", encode('gbk', $text);
    return $str;
}[/code] [i=s] 本帖最后由 523066680 于 2018-10-4 17:40 编辑 [/i]
整理结果保存到网盘了 [url]http://523066680.ys168.com/[/url] -> 临时 -> 网页提取 -> 句子大全_text.zip
[url=http://ys-o.ys168.com/205774942/i7N424H6IG6P45hVRGkM/句子大全_text.zip]句子大全_text.zip[/url]
问题
1. 剔除空格会把英语单词之间的空格替换掉,已改
2. 这个网站每个条目下的文章列表并不是完整列表,有很多是没有直接展示的。通过上一页下一页会翻到一些其他文章。 [i=s] 本帖最后由 523066680 于 2018-10-5 15:55 编辑 [/i]
在写多线程按页码抓,估计几个G ……
--补充1
扒完了,146157页,2.76G。
提取出来的文本,800MB 不想上传了(逃
--补充2
运行了一下脚本,发现更新了,146342了 [i=s] 本帖最后由 523066680 于 2018-10-5 16:25 编辑 [/i]
[list]
[*][size=4][b]多线程抓取HTML[/b][/size]
[/list]
迭代页码($iter变量)并尝试获取网页,失败5次以后判断为结束。已经存在的网页会略过。[code]=info
523066680/vicyang
2018-10
=cut
use Modern::Perl;
use utf8;
use Encode;
use File::Path;
use File::Slurp;
use LWP::UserAgent;
use Mojo::DOM;
use threads;
use threads::shared;
use Time::HiRes qw/sleep time/;
STDOUT->autoflush(1);

our @ths;
our $iter   :shared;   # next page number to fetch (shared work counter)
our $failed :shared;   # failure count across all workers
our $main = "http://www.1juzi.com";
our $wdir = encode('gbk', "D:/temp/句子大全_byNo.");
mkpath $wdir unless -e $wdir;
chdir $wdir;

$iter   = 1;
$failed = 0;

# Spawn four worker threads.
# FIX: plain for-loops instead of grep in void context (here and below).
push @ths, threads->create( \&func, $_ ) for 0 .. 3;

# Poll until enough requests have failed, which we take to mean the page
# numbers have run past the last existing article.
while ( $failed <= 5 ) { sleep 1.0; }

# Detach all workers so the process can exit.
$_->detach() for threads->list(threads::all);
sub func
{
    # Worker thread: repeatedly claim the next page number from the shared
    # $iter counter, fetch "$main/new/<n>.html", and save it.  After 3
    # failed retries for one page the shared $failed counter is bumped so
    # the main loop can decide the site has run out of pages.
    our ($main, $failed, $iter);
    my $idx = shift;    # worker index — only used to stagger log output
    my ($link, $file);
    # FIX: "timemout" was a typo — the 6-second timeout never applied.
    my $ua = LWP::UserAgent->new( keep_alive => 1, timeout => 6 );
    $SIG{'BREAK'} = sub { threads->exit() };   # Windows Ctrl-Break handler
    my $res;
    my $retry;
    my $task;
    while (1)
    {
        {
            lock($iter);
            $task = $iter++;    # atomically claim the next page number
        }
        $link = "${main}/new/${task}.html";
        $file = "${task}.html";
        if ( -e $file ) { printf "%s exists\n", $file; next; }
        printf "%s\n", $file;
        $retry = 0;
        do
        {
            $res = $ua->get( $link );
            if ($retry > 0)
            {
                printf "[%d]%s%s, retry times: %d\n", $idx, " "x($idx+1), $file, $retry;
                sleep 0.5;
            }
            $retry++;
        }
        until ( $res->is_success() or ($retry > 3) );
        if ( $res->is_success ) { write_file( $file, $res->content ); }
        else
        {
            # FIX: lock before incrementing — the original "$failed++" on a
            # :shared variable raced between threads and could lose updates.
            lock($failed);
            $failed++;
        }
    }
}
[/code] [list]
[*][size=4][b]内容提取 分类汇总[/b][/size]
[/list][code]=info
523066680/vicyang
2018-10
=cut
use utf8;
use Encode;
use File::Path;
use File::Slurp;
use Mojo::DOM;
STDOUT->autoflush(1);

our $wdir = encode('gbk', "D:/Temp/句子大全_byNo");
chdir $wdir or warn "$!";

our %FH;    # per-category output filehandles, keyed by target txt path

my @files = `dir "$wdir" /a-d /b`;
# FIX: plain for-loops instead of grep in void context (here and below).
s/\r?\n// for @files;
# Process pages in ASCENDING numeric order of the filename stem.
@files = sort { substr($a, 0, -5) <=> substr($b, 0, -5) } @files;

for my $file ( @files )
{
    article($file);
    printf "%s\n", $file;
}

# Close every category file so buffered output is flushed.
close $_ for values %FH;
sub article
{
    # Parse one saved page, derive its category folder from the ".path"
    # breadcrumb, and append the extracted title/text to that category's
    # txt file.  Output handles are cached in %FH and created on demand.
    our %FH;
    my $page = shift;
    my $html = decode('gbk', scalar(read_file( $page )) );
    # Strip a space-like character the site injects (NOTE(review): confirm
    # which codepoint — see thread note about English-word spacing).
    $html =~s/ //g;
    # FIX: lexical $dom — the original leaked an undeclared package global.
    my $dom = Mojo::DOM->new( $html );
    # Breadcrumb links: the first two levels form the destination folder,
    # the last one names the txt file.
    my @path = @{ $dom->at(".path")->find("a")->map("text") };
    $_ = encode("gbk", $_) for @path;
    my $path = "../". join("/", @path[0,1]);
    my $file = "${path}/${path[-1]}.txt";
    mkpath $path unless -e $path;
    unless ( exists $FH{$file} )
    {
        printf "create %s\n", $file;
        # FIX: check open for failure instead of silently continuing with
        # an unusable handle.
        open $FH{$file}, ">:raw:crlf", $file or die "open $file: $!";
        $FH{$file}->autoflush(1);
    }
    # Remove ad/navigation clutter: <script>, <u>, and the prev/next pager.
    $_->remove for $dom->at(".content")->find("script")->each;
    $_->remove for $dom->at(".content")->find("u")->each;
    $dom->at(".page")->remove;
    my $title = $dom->at("h1")->all_text;
    my $text  = $dom->at(".content")->all_text;
    $text =~s/(\d+、)/\n$1/g;    # put each numbered sentence on its own line
    $text =~s/\Q$title\E//;      # the body repeats the title — drop it
    $text =~s/[\r\n]+/\n/g;
    $text =~s/^\n//;
    my $str;
    $str  = sprintf "%s\n", encode('gbk', $title );
    $str .= sprintf "%s\n", $page;
    $str .= sprintf "%s\n", encode('gbk', $text);
    print { $FH{$file} } $str;
}
[/code] 部分结果 在我的网盘 [url]http://523066680.ys168.com/[/url]
临时/网页提取
页:
[1]