本帖最后由 523066680 于 2018-12-7 20:07 编辑
提取正文的代码,Mojo::DOM 解析HTML,提取元素。
保存的文件名格式为 :作者+标题+诗歌类型+页面ID | | | | | | | | | | | use Mojo::DOM; | | use File::Slurp; | | use Encode; | | use File::Path; | | use Try::Tiny; | | use File::Basename qw/basename/; | | use Win32API::File qw(:ALL); | | STDOUT->autoflush(1); | | | | our $src = "D:/temp/52shici_mobile"; | | our $dst = "D:/temp/52shici_extract"; | | my $item; | | my (@files, %already, $pgname, $dirlist); | | my ($rate, $prev); | | | | for my $subdir ( glob "${src}/*" ) | | { | | printf "%s\n", $subdir; | | $item = basename($subdir); | | mkpath "${dst}/${item}" unless -e "${dst}/${item}"; | | | | | | @files = glob "${subdir}/*"; | | | | | | %already = (); | | my $dirlist = decode("utf16-le", `cmd /U /C dir /b \"${dst}/${item}\"`); | | grep { $_=~/(works_id=\d+)/; $already{$1} = 1; } split("\r\n", $dirlist); | | | | ($rate, $prev) = (0.0, 0.0); | | for my $id ( 0 .. $#files ) | | { | | | | $rate = $id / $#files * 100.0; | | if ( ($rate-$prev) >= 1.0 ) { | | printf "%d\% ", $rate; | | $prev = $rate; | | } | | $pgname = basename($files[$id], ".html"); | | next if exists $already{$pgname}; | | abstract( "${dst}/${item}", $item, $files[$id] ); | | } | | printf "\n"; | | | | exit; | | } | | | | sub abstract | | { | | my ($path, $item, $page) = @_; | | my $html = read_file( $page ); | | $html=~s/\ //g; | | | | | | my $dom = Mojo::DOM->new( $html ); | | my $buff = ""; | | my ($fname, $head) = ("", ""); | | my $id = basename($page, ".html"); | | | | | | my $author = $dom->at(".works-author a")->text; | | my $title = $dom->at(".works-title")->text; | | my $type = $dom->at(".works-type")->text; | | my $date = $dom->at(".works-author")->text; | | | | $author =~s/^\s+//; | | $date =~s/:/./g; | | $head = join(" ", $author, $title, $type, $date ); | | $fname = join(" ", $author, $title, $type, $id ); | | | | | | $buff .= $head; | | $buff .= $dom->at("#content_box")->all_text; | | | | $buff=~s/\r?\n([ \t]+)?/#MARK/g; | | $buff=~s/ {2,}/ /sge; | | $buff=~s/(#MARK){2,}/#MARK#MARK/g; | | $buff=~s/#MARK/\r\n/g; | | | | | | $path = decode('gbk', $path); | | $fname = decode('utf8', $fname); | | | | | | $fname =~s/\p{IsCntrl}//g; | | create_with_unicode_fname( $path, $fname, \$buff ); | | } | | | | sub create_with_unicode_fname | | { | | my ($path, $title, $buff) = @_; | | $title =~s/[\Q*?":<>|\\\/\E]/ /g; | | $path .= "/". $title .".txt\0\0"; | | $path = encode('utf16-le', $path); | | | | my $F = CreateFileW( $path, GENERIC_WRITE, 0, [], OPEN_ALWAYS, 0, 0); | | | | | | try { | | OsFHandleOpen(FILE, $F, "w") or die "Cannot open file"; | | } catch { printf "\nCan't create file: %s\n", encode('gbk', $title); return; }; | | binmode FILE; | | print FILE $$buff; | | close(FILE); | | } | | | | sub utf2gbk { return encode('gbk', decode('utf8', $_[0] )); }COPY |
|