标题: [原创代码] [Perl]抓取句子大全的网页 [打印本页]
作者: 523066680 时间: 2018-10-4 16:28 标题: [Perl]抓取句子大全的网页
本帖最后由 523066680 于 2018-10-4 16:32 编辑
[Perl]抓取句子大全的网页
分两步,第一步先提取网页,然后再本地提取文本。
可以中途终止脚本,重新打开后会略过已经完成的条目。
抓取的 HTML 保存在 D:/temp/句子大全 文件夹- =info
- 523066680/vicyang
- 2018-10
- =cut
-
- use utf8;
- use Encode;
- use File::Path;
- use File::Slurp;
- use LWP::UserAgent;
- use File::Path;
- use File::Basename qw/basename/;
- use Mojo::DOM;
- STDOUT->autoflush(1);
-
- our $wdir = encode('gbk', "D:/temp/句子大全");
- mkpath $wdir unless -e $wdir;
- our $main = "http://www.1juzi.com";
- our $ua = LWP::UserAgent->new( keep_alive => 1, timemout => 8 );
- my $res = $ua->get($main);
- my $html = $res->content();
- my $dom = Mojo::DOM->new($html);
-
- my (@urls, @dirs);
- get_item($dom, \@urls, \@dirs);
-
- my $tdir;
- for my $id ( 0 .. $#urls )
- {
- printf "%s\n", $dirs[$id];
- next if -e $dirs[$id]; # Skip when this folder exists
- $tdir = $dirs[$id] ."_";
- mkpath $tdir unless -e $tdir;
- get_alist( $main .$urls[$id], $tdir );
- rename( $tdir , $dirs[$id] ); # Restore name
- }
-
- sub get_item
- {
- our $wdir;
- my ($dom, $urls, $dirs) = @_;
- my $menu = $dom->at(".header-menu");
-
- for my $e ( $menu->find("ul li a")->each )
- {
- push @$urls, $e->attr("href");
- push @$dirs, sprintf "%s/%s/%s", $wdir, $e->parent->parent->previous->text, $e->text;
- }
- }
-
- sub get_alist
- {
- our $main;
- my ($url, $dir) = @_;
- my $res = $ua->get( $url );
- my $dom = Mojo::DOM->new( $res->content );
- my @links;
- @links = @{ $dom->at(".alist")->find("a")->map(attr=>"href") };
-
- #get_page
- my $retry;
- for my $link ( @links )
- {
- printf " %s\n", $link;
- $retry = 0;
- do
- {
- $res = $ua->get( $main .$link );
- $retry++;
- print "retry times: $retry\n" if ($retry > 1 );
- }
- until ( $res->is_success() );
-
- write_file( $dir ."/". basename($link), $res->content );
- }
- }
复制代码
作者: 523066680 时间: 2018-10-4 16:49 标题: 第二段代码,从本地的HTML提取正文,汇总到TXT
本帖最后由 523066680 于 2018-10-4 21:59 编辑
话说这个网站是不是被攻击了?
文章夹杂各种 <script> 例如- <script>Baidu_A_D("b1");</script>
- <script type="text/javascript">news1();</script>
复制代码
做了remove处理- =info
- 523066680/vicyang
- 2018-10
- =cut
-
- use utf8;
- use Encode;
- use File::Slurp;
- use Mojo::DOM;
- STDOUT->autoflush(1);
-
- our $wdir = encode('gbk', "D:/Temp/句子大全");
- chdir $wdir or warn "$!";
-
- my $buff;
- my @files;
- my @dirs = `dir "$wdir" /ad /s /b`;
- grep { s/\r?\n//; } @dirs;
-
- for my $dir ( @dirs )
- {
- printf "%s\n", $dir;
- chdir $dir or die "$!";
- @files = glob "*.html";
- next unless $#files >= 0;
- $buff = "";
- grep { $buff .= article( $_ ) } sort { substr($b, 0, -5) <=> substr($a, 0, -5) } @files;
- write_file( "${dir}.txt", $buff );
- }
-
- sub article
- {
- my $file = shift;
- my $html = decode('gbk', scalar(read_file( $file )) );
- $html =~s/ //g;
-
- $dom = Mojo::DOM->new( $html );
- # remove tags: <script>, <u>, and next/prev page
- grep { $_->remove } $dom->at(".content")->find("script")->each;
- grep { $_->remove } $dom->at(".content")->find("u")->each;
- $dom->at(".page")->remove;
- my $title = $dom->at("h1")->all_text;
- my $text = $dom->at(".content")->all_text;
-
- $text =~s/(\d+、)/\n$1/g;
- $text =~s/\Q$title\E//;
- $text =~s/[\r\n]+/\n/g;
- $text =~s/^\n//;
-
- my $str;
- $str = sprintf "%s\n", encode('gbk', $title );
- $str .= sprintf "%s\n", $file;
- $str .= sprintf "%s\n", encode('gbk', $text);
- return $str;
- }
复制代码
作者: 523066680 时间: 2018-10-4 17:18
本帖最后由 523066680 于 2018-10-4 17:40 编辑
整理结果保存到网盘了 http://523066680.ys168.com/ -> 临时 -> 网页提取 -> 句子大全_text.zip
句子大全_text.zip
问题
1. 剔除空格会把英语单词之间的空格替换掉,已改
2. 这个网站每个条目下的文章列表并不是完整列表,有很多是没有直接展示的。通过上一页下一页会翻到一些其他文章。
作者: 523066680 时间: 2018-10-4 19:17
本帖最后由 523066680 于 2018-10-5 15:55 编辑
在写多线程按页码抓,估计几个G ……
--补充1
扒完了,146157页,2.76G。
提取出来的文本,800MB 不想上传了(逃
--补充2
运行了一下脚本,发现更新了,146342了
作者: 523066680 时间: 2018-10-5 16:13
本帖最后由 523066680 于 2018-10-5 16:25 编辑
迭代页码($iter变量)并尝试获取网页,失败5次以后判断为结束。已经存在的网页会略过。- =info
- 523066680/vicyang
- 2018-10
- =cut
-
- use Modern::Perl;
- use utf8;
- use Encode;
- use File::Path;
- use File::Slurp;
- use LWP::UserAgent;
- use Mojo::DOM;
- use threads;
- use threads::shared;
- use Time::HiRes qw/sleep time/;
- STDOUT->autoflush(1);
-
- our $idx = 0;
- our @ths;
- our $iter :shared;
- our $failed :shared;
-
- our $main = "http://www.1juzi.com";
- our $wdir = encode('gbk', "D:/temp/句子大全_byNo.");
- mkpath $wdir unless -e $wdir;
- chdir $wdir;
-
- $iter = 1;
- $failed = 0;
- #创建线程
- grep { push @ths, threads->create( \&func, $_ ) } ( 0 .. 3 );
-
- while ( $failed <= 5 ) { sleep 1.0; }
-
- #线程终结和分离
- grep { $_->detach() } threads->list(threads::all);
-
- sub func
- {
- our ($main, $failed, $iter);
- my $idx = shift;
- my ($link, $file);
- my $ua = LWP::UserAgent->new( keep_alive => 1, timemout => 6 );
-
- $SIG{'BREAK'} = sub { threads->exit() };
-
- my $res;
- my $retry;
- my $task;
-
- while (1)
- {
- {
- lock($iter);
- $task = $iter++;
- }
-
- $link = "${main}/new/${task}.html";
- $file = "${task}.html";
- if ( -e $file ) { printf "%s exists\n", $file; next; }
-
- printf "%s\n", $file;
- $retry = 0;
- do
- {
- $res = $ua->get( $link );
- if ($retry > 0)
- {
- printf "[%d]%s%s, retry times: %d\n", $idx, " "x($idx+1), $file, $retry;
- sleep 0.5;
- }
- $retry++;
- }
- until ( $res->is_success() or ($retry > 3) );
-
- if ( $res->is_success ) { write_file( $file, $res->content ); }
- else { $failed++; }
- }
- }
复制代码
作者: 523066680 时间: 2018-10-5 16:25
- =info
- 523066680/vicyang
- 2018-10
- =cut
-
- use utf8;
- use Encode;
- use File::Path;
- use File::Slurp;
- use Mojo::DOM;
- STDOUT->autoflush(1);
-
- our $wdir = encode('gbk', "D:/Temp/句子大全_byNo");
- chdir $wdir or warn "$!";
-
- our %FH;
- my @files = `dir "$wdir" /a-d /b`;
- grep { s/\r?\n// } @files;
- @files = sort { substr($a, 0, -5) <=> substr($b, 0, -5) } @files;
-
- grep {
- article($_);
- printf "%s\n", $_;
- } @files;
-
- for my $v (values %FH) { close $v }
-
- sub article
- {
- our %FH;
- my $page = shift;
- my $html = decode('gbk', scalar(read_file( $page )) );
- $html =~s/ //g;
-
- $dom = Mojo::DOM->new( $html );
- # path
- my @path = @{ $dom->at(".path")->find("a")->map("text") };
- grep { $_ = encode("gbk", $_) } @path;
-
- my $path = "../". join("/", @path[0,1]);
- my $file = "${path}/${path[-1]}.txt";
- mkpath $path unless -e $path;
-
- unless ( exists $FH{$file} )
- {
- printf "create %s\n", $file;
- open $FH{$file}, ">:raw:crlf", $file;
- $FH{$file}->autoflush(1);
- }
-
- # remove tags: <script>, <u>, and next/prev page
- grep { $_->remove } $dom->at(".content")->find("script")->each;
- grep { $_->remove } $dom->at(".content")->find("u")->each;
- $dom->at(".page")->remove;
- my $title = $dom->at("h1")->all_text;
- my $text = $dom->at(".content")->all_text;
-
- $text =~s/(\d+、)/\n$1/g;
- $text =~s/\Q$title\E//;
- $text =~s/[\r\n]+/\n/g;
- $text =~s/^\n//;
-
- my $str;
- $str = sprintf "%s\n", encode('gbk', $title );
- $str .= sprintf "%s\n", $page;
- $str .= sprintf "%s\n", encode('gbk', $text);
-
- print { $FH{$file} } $str;
- }
复制代码
作者: 523066680 时间: 2018-10-6 08:18
部分结果 在我的网盘 http://523066680.ys168.com/
临时/网页提取
欢迎光临 批处理之家 (http://bbs.bathome.net/) |
Powered by Discuz! 7.2 |