本帖最后由 523066680 于 2018-12-7 20:07 编辑
提取正文的代码,Mojo::DOM 解析HTML,提取元素。
保存的文件名格式为 :作者+标题+诗歌类型+页面ID- =info
- 523066680/vicyang
- 2018-12
- =cut
-
- use Mojo::DOM;
- use File::Slurp;
- use Encode;
- use File::Path;
- use Try::Tiny;
- use File::Basename qw/basename/;
- use Win32API::File qw(:ALL);
- STDOUT->autoflush(1);
-
- our $src = "D:/temp/52shici_mobile";
- our $dst = "D:/temp/52shici_extract";
- my $item;
- my (@files, %already, $pgname, $dirlist);
- my ($rate, $prev);
-
- for my $subdir ( glob "${src}/*" )
- {
- printf "%s\n", $subdir;
- $item = basename($subdir);
- mkpath "${dst}/${item}" unless -e "${dst}/${item}";
-
- # 需要处理的文件列表
- @files = glob "${subdir}/*";
-
- # 创建现有名单哈希表
- %already = ();
- my $dirlist = decode("utf16-le", `cmd /U /C dir /b \"${dst}/${item}\"`);
- grep { $_=~/(works_id=\d+)/; $already{$1} = 1; } split("\r\n", $dirlist);
-
- ($rate, $prev) = (0.0, 0.0);
- for my $id ( 0 .. $#files )
- {
- # 显示进度,百分率
- $rate = $id / $#files * 100.0;
- if ( ($rate-$prev) >= 1.0 ) {
- printf "%d\% ", $rate;
- $prev = $rate;
- }
- $pgname = basename($files[$id], ".html");
- next if exists $already{$pgname};
- abstract( "${dst}/${item}", $item, $files[$id] );
- }
- printf "\n";
-
- exit;
- }
-
- sub abstract
- {
- my ($path, $item, $page) = @_;
- my $html = read_file( $page );
- $html=~s/\ //g;
- #$html=~s/<script>.*$//es;
-
- my $dom = Mojo::DOM->new( $html );
- my $buff = "";
- my ($fname, $head) = ("", "");
- my $id = basename($page, ".html");
-
- # 作者 标题 类型 日期
- my $author = $dom->at(".works-author a")->text;
- my $title = $dom->at(".works-title")->text;
- my $type = $dom->at(".works-type")->text;
- my $date = $dom->at(".works-author")->text;
-
- $author =~s/^\s+//; # 去掉开头space
- $date =~s/:/./g;
- $head = join(" ", $author, $title, $type, $date );
- $fname = join(" ", $author, $title, $type, $id );
- #printf "%s %s\n", utf2gbk($fname), $page;
-
- $buff .= $head;
- $buff .= $dom->at("#content_box")->all_text;
-
- $buff=~s/\r?\n([ \t]+)?/#MARK/g;
- $buff=~s/ {2,}/ /sge; # 合并连续空格
- $buff=~s/(#MARK){2,}/#MARK#MARK/g; # 合并超过两行的换行
- $buff=~s/#MARK/\r\n/g;
-
- #write_file( "${path}/${title}.txt", {binmode => ":raw"}, $buff );
- $path = decode('gbk', $path);
- $fname = decode('utf8', $fname);
-
- #去掉控制字符(某些文章标题就出现了<0x08>,<0x03>),并且是在Unicode编码下替换
- $fname =~s/\p{IsCntrl}//g;
- create_with_unicode_fname( $path, $fname, \$buff );
- }
-
- sub create_with_unicode_fname
- {
- my ($path, $title, $buff) = @_;
- $title =~s/[\Q*?":<>|\\\/\E]/ /g; # 去掉windows文件名敏感字符
- $path .= "/". $title .".txt\0\0"; # 合并路径,追加 \0\0
- $path = encode('utf16-le', $path);
-
- my $F = CreateFileW( $path, GENERIC_WRITE, 0, [], OPEN_ALWAYS, 0, 0);
-
- # 这里 FILE 句柄不支持 $FILE 变量形式
- try {
- OsFHandleOpen(FILE, $F, "w") or die "Cannot open file";
- } catch { printf "\nCan't create file: %s\n", encode('gbk', $title); return; };
- binmode FILE;
- print FILE $$buff;
- close(FILE);
- }
-
- sub utf2gbk { return encode('gbk', decode('utf8', $_[0] )); }
复制代码
|