- =info
- 523066680/vicyang
- 2018-10
- =cut
-
- use utf8;
- use Encode;
- use File::Path;
- use File::Slurp;
- use Mojo::DOM;
- STDOUT->autoflush(1);
-
- our $wdir = encode('gbk', "D:/Temp/句子大全_byNo");
- chdir $wdir or warn "$!";
-
- our %FH;
- my @files = `dir "$wdir" /a-d /b`;
- grep { s/\r?\n// } @files;
- @files = sort { substr($a, 0, -5) <=> substr($b, 0, -5) } @files;
-
- grep {
- article($_);
- printf "%s\n", $_;
- } @files;
-
- for my $v (values %FH) { close $v }
-
- sub article
- {
- our %FH;
- my $page = shift;
- my $html = decode('gbk', scalar(read_file( $page )) );
- $html =~s/ //g;
-
- $dom = Mojo::DOM->new( $html );
- # path
- my @path = @{ $dom->at(".path")->find("a")->map("text") };
- grep { $_ = encode("gbk", $_) } @path;
-
- my $path = "../". join("/", @path[0,1]);
- my $file = "${path}/${path[-1]}.txt";
- mkpath $path unless -e $path;
-
- unless ( exists $FH{$file} )
- {
- printf "create %s\n", $file;
- open $FH{$file}, ">:raw:crlf", $file;
- $FH{$file}->autoflush(1);
- }
-
- # remove tags: <script>, <u>, and next/prev page
- grep { $_->remove } $dom->at(".content")->find("script")->each;
- grep { $_->remove } $dom->at(".content")->find("u")->each;
- $dom->at(".page")->remove;
- my $title = $dom->at("h1")->all_text;
- my $text = $dom->at(".content")->all_text;
-
- $text =~s/(\d+、)/\n$1/g;
- $text =~s/\Q$title\E//;
- $text =~s/[\r\n]+/\n/g;
- $text =~s/^\n//;
-
- my $str;
- $str = sprintf "%s\n", encode('gbk', $title );
- $str .= sprintf "%s\n", $page;
- $str .= sprintf "%s\n", encode('gbk', $text);
-
- print { $FH{$file} } $str;
- }
复制代码
|