标题: 【完结】抓取电视剧剧情+各50元 [打印本页]
作者: lxh623 时间: 2021-10-10 17:47 标题: 【完结】抓取电视剧剧情+各50元
本帖最后由 lxh623 于 2021-10-12 07:21 编辑
文本文件内是网址,一行一个。
文本c是,https://www.tvzn.com/8406/
文本d是,https://www.tvmao.com/drama/Ym4mXGVf
想得到如下结果:(写入新的文本)
TTT上海沧桑
第1集
1925年,湖南醴陵。................
第2集
................
................
两个网站,每个网站五十元。
谢谢!
作者: zaqmlp 时间: 2021-10-10 19:32
本帖最后由 zaqmlp 于 2021-10-10 20:53 编辑
- <# :
- rem Batch + PowerShell hybrid launcher for the www.tvzn.com scraper.
- rem www.tvzn.com
- rem Save this file as an ANSI-encoded .bat file - it is re-read below as GB2312 text.
- rem cmd.exe executes the lines down to "exit"; PowerShell treats the same lines as one block comment.
- cls&echo off&cd /d "%~dp0"
- rem Re-read this very file as GB2312 text and execute it as PowerShell code.
- powershell -NoProfile -ExecutionPolicy bypass "[IO.File]::ReadAllText('%~f0',[Text.Encoding]::GetEncoding('GB2312'))|Invoke-Expression"
- pause
- exit
- #>
- # Input list and output file, both resolved against the current directory.
- $infile='.\文本c.txt';
- $outfile='.\结果.txt';
- # Stop right away when the URL list does not exist.
- if(!(Test-Path -LiteralPath $infile)){
-     Write-Host ('"'+$infile+'" 未找到');
-     exit;
- }
- # Shared by the code that follows: UTF-8 for page decoding and file I/O, plus one reusable WebClient.
- $enc=[Text.Encoding]::UTF8;
- $webclient=New-Object -TypeName System.Net.WebClient;
- # Download the page at $u with the shared $webclient, decoding it with $enc.
- # Makes at most three attempts, waiting 3 seconds after each failure.
- # Returns the page source, or an empty string when every attempt failed.
- function gethtml($u){
-     $userAgent='Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36';
-     $page='';
-     $attempt=1;
-     while($attempt -le 3){
-         try{
-             # The header and encoding are applied again before every attempt.
-             $webclient.Headers.Add('User-Agent',$userAgent);
-             $webclient.Encoding=$enc;
-             $page=$webclient.DownloadString($u);
-             break;
-         }catch{
-             write-host ('第'+$attempt.toString()+'次获取网页源码失败');
-             start-sleep -Seconds 3;
-         }
-         $attempt++;
-     }
-     return $page;
- };
-
- # Main loop: for each show URL listed in $infile, write 'TTT' + title, then every episode as a
- # '第N集' heading followed by its synopsis text, into $outfile (recreated on every run).
- $fs=New-Object System.IO.FileStream($outfile, [System.IO.FileMode]::Create);
- $sw=New-Object System.IO.StreamWriter($fs, $enc);
- try{
-     $text=[IO.File]::ReadAllText($infile, $enc).split("`r`n",[StringSplitOptions]::RemoveEmptyEntries);
-     for($i=0;$i -lt $text.Count;$i++){
-         $line=$text[$i].Trim();
-         # Skip whitespace-only lines instead of requesting a bogus URL.
-         if($line -eq ''){continue;}
-         write-host ('--------------'+$line+'--------------');
-         $url=$line.trimend(' /');
-         $url1=$url+'/juqingjieshao.html';
-         $html1=gethtml $url1;
-         $title='TTT';
-         $m1=[regex]::match($html1,'>([^<]+?)</h1></div>');
-         if($m1.Success){$title=$title+$m1.groups[1].value.replace('剧情介绍','');}
-         write-host $title;
-         $sw.WriteLine($title);
-         # The per-episode links are taken from the block whose id is "danji".
-         $m1=[regex]::match($html1,'<div id="danji"[^>]*?>([\s\S]+?)</div></div></div>');
-         if($m1.Success){
-             $m2=[regex]::matches($m1.groups[1].value, '<a class="link" href="([^"]+?)">(\d+)</a>');
-             # Scheme + host of the show URL; the matched hrefs are appended to it.
-             $site=$url -replace '^(https?://[^/]+).+','$1';
-             # Links are walked last-to-first - presumably the page lists the newest episode first (confirm).
-             for($j=$m2.Count-1;$j -ge 0;$j--){
-                 $sw.WriteLine('第'+$m2[$j].groups[2].value+'集');
-                 $url2=$site+$m2[$j].groups[1].value;
-                 write-host ($m2[$j].groups[2].value+' '+$url2);
-                 $html2=gethtml $url2;
-                 $m3=[regex]::match($html2,'<div>.*?<center>.*?</center>.*?<br>([\s\S]+?)<div');
-                 if($m3.Success){
-                     # Drop <p>, turn non-breaking spaces (HTML entity or U+00A0) into plain spaces and
-                     # end each paragraph at </p>. The \u escape keeps this file ANSI/GB2312-safe.
-                     $content=(($m3.groups[1].value -replace '<p>','') -replace '&nbsp;|\u00A0',' ') -replace '</p>',"`r`n";
-                     $sw.WriteLine($content.trim());
-                 }
-                 # Flush after every episode so finished work is on disk if the run is interrupted.
-                 $sw.Flush();
-             }
-         }
-         $sw.WriteLine('');
-     }
- }finally{
-     # Always release the output file, also when the run is aborted part-way.
-     $sw.Close();
-     $fs.Close();
- }
复制代码
作者: zaqmlp 时间: 2021-10-10 19:32
本帖最后由 zaqmlp 于 2021-10-11 16:21 编辑
- <# :
- rem Batch + PowerShell hybrid launcher for the www.tvmao.com scraper.
- rem www.tvmao.com
- rem Save this file as an ANSI-encoded .bat file - it is re-read below as GB2312 text.
- rem cmd.exe executes the lines down to "exit"; PowerShell treats the same lines as one block comment.
- cls&echo off&cd /d "%~dp0"
- rem Re-read this very file as GB2312 text and execute it as PowerShell code.
- powershell -NoProfile -ExecutionPolicy bypass "[IO.File]::ReadAllText('%~f0',[Text.Encoding]::GetEncoding('GB2312'))|Invoke-Expression"
- pause
- exit
- #>
-
- # Tunables:
- #   $starturl - 1-based position in the list of the first URL to process; earlier entries are skipped.
- #   $minilen  - a fetched synopsis is accepted only when it is longer than this many characters.
- #   $timeout  - seconds to pause after three too-short fetches in a row before trying again.
- $starturl,$minilen,$timeout=1,160,900;
- # Input list and output file, both resolved against the current directory.
- # NOTE(review): the request in this thread calls the tvmao list "文本d"; confirm the file name below.
- $infile='.\文本b.txt';
- $outfile='.\结果.txt';
- # Stop right away when the URL list does not exist.
- if(!(Test-Path -LiteralPath $infile)){
-     Write-Host ('"'+$infile+'" 未找到');
-     exit;
- }
- # Shared by the code that follows: UTF-8 for page decoding and file I/O, plus one reusable WebClient.
- $enc=[Text.Encoding]::UTF8;
- $webclient=New-Object -TypeName System.Net.WebClient;
- # Fetch the page source of $u through the shared $webclient (decoded with $enc).
- # Up to three tries, with a 3-second pause after each failed one.
- # Output: the page text, or '' when no try succeeded.
- function gethtml($u){
-     $userAgent='Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36';
-     $source='';
-     $try=1;
-     while($try -le 3){
-         try{
-             # Header and encoding are set again on every try.
-             $webclient.Headers.Add('User-Agent',$userAgent);
-             $webclient.Encoding=$enc;
-             $source=$webclient.DownloadString($u);
-             break;
-         }catch{
-             write-host ('第'+$try.toString()+'次获取网页源码失败');
-             start-sleep -Seconds 3;
-         }
-         $try++;
-     }
-     return $source;
- };
-
- # Download one episode page and return its synopsis as plain text.
- # Parameter: $u - absolute URL of the episode page.
- # Returns '' when the download yields nothing or the page has no <article class="clear epi_c"> block.
- function getcontent($u){
-     $content='';
-     $html2=gethtml $u;
-     $m3=[regex]::match($html2,'<article class="clear epi_c"[^>]*?>([\s\S]+?)</article>');
-     if($m3.Success){
-         $content=$m3.groups[1].value -replace '<p>','';
-         # Non-breaking spaces arrive as the &nbsp; entity (or as U+00A0); turn both into plain spaces.
-         # The \u escape keeps this file ANSI/GB2312-safe.
-         $content=$content -replace '&nbsp;|\u00A0',' ';
-         # Each closing </p> ends a paragraph.
-         $content=$content -replace '</p>',"`r`n";
-         # Remove embedded <div> blocks together with their content.
-         $content=$content -replace '<div(?: [^>]*?)?>[\s\S]+?</div>','';
-         # Strip the remaining tags, then any dangling tag start left without its closing '>'.
-         $content=(($content -replace '<[^>]+?>','') -replace '</?[a-z]+','').trim();
-     }
-     return $content;
- }
-
- # Main loop: for every listed show from position $starturl onward, collect 'TTT' + title and each
- # episode ('第N集' heading + synopsis) in memory, then append the whole show to $outfile at once.
- # The file is opened in Append mode, so output from earlier runs is kept.
- $fs=New-Object System.IO.FileStream($outfile, [System.IO.FileMode]::Append);
- $sw=New-Object System.IO.StreamWriter($fs, $enc);
- $text=[IO.File]::ReadAllText($infile, $enc).split("`r`n",[StringSplitOptions]::RemoveEmptyEntries);
- for($i=0;$i -lt $text.Count;$i++){
-     # Entries before the configured start position are skipped.
-     if(($i+1) -lt $starturl){continue;}
-     $entry=$text[$i];
-     write-host ('--------------'+$entry+'--------------');
-     $result=New-Object -TypeName System.Collections.ArrayList;
-     $url=$entry.trimend(' /');
-     $html1=gethtml ($url+'/episode');
-     $title='TTT';
-     $titleHit=[regex]::match($html1,'title="([^"]+?)剧情介绍".*?>介绍');
-     if($titleHit.Success){$title+=$titleHit.groups[1].value.replace('剧情介绍','');}
-     write-host $title;
-     [void]$result.add($title);
-     $pager=[regex]::match($html1,'<div class="epipage clear">([\s\S]+?)</div>');
-     if($pager.Success){
-         # Scheme + host of the show URL; the matched hrefs are appended to it.
-         $site=$url -replace '^(https?://[^/]+).+','$1';
-         foreach($link in [regex]::matches($pager.groups[1].value, '<li(?: [^>]*?)?>.*?href="([^"]+?)"[^>]*?>(\d+)</a></li>')){
-             [void]$result.add('第'+$link.groups[2].value+'集');
-             $episodeUrl=$site+$link.groups[1].value;
-             write-host ($link.groups[2].value+' '+$episodeUrl);
-             # Keep fetching until the synopsis is longer than $minilen characters; after every third
-             # short result in a row, pause $timeout seconds.
-             # NOTE(review): there is no upper bound - an episode whose text never exceeds $minilen
-             # keeps this loop running forever.
-             $shortCount=0;
-             while($true){
-                 $body=getcontent $episodeUrl;
-                 write-host $body.length;
-                 if($body.length -gt $minilen){
-                     [void]$result.add($body);
-                     break;
-                 }
-                 $shortCount++;
-                 if($shortCount -ge 3){
-                     $shortCount=0;
-                     write-host ('字符数少于'+$minilen+',暂停'+$timeout+'秒……');
-                     start-sleep -Seconds $timeout;
-                 }
-             }
-         }
-     }
-     # One show = one joined block followed by an empty line.
-     $sw.WriteLine(($result -join "`r`n"));
-     $sw.WriteLine('');
-     $sw.Flush();
- }
- $sw.Close();
- $fs.Close();
复制代码
作者: went 时间: 2021-10-10 19:53
tvzn.bat- #&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~f0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
- # Everything from this line on is PowerShell: the launcher line above is dropped by Select-Object -Skip 1.
- # The launcher passes the script's full path (%~f0) so it is still found after the cd /d and when
- # the file was started without its .bat extension.
- cls
- # Shared WinHTTP COM request object used for every download.
- $whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
- # Settings
- $list_file = 'c.txt'    # input list, read line by line; each line is used as a show URL
- $out_file = 'info.txt'  # output file the collected text is appended to
-
- # Function: fetch a show's synopsis index page and list all of its episode links.
- # Parameter: $url - show base URL such as https://www.tvzn.com/8406/ (the trailing slash is optional).
- # Output: the show title first, then one 'N|episode-url' string per episode link, sorted by N ascending.
- function Get-PartList($url) {
-     # Normalise the base URL so the index address is built correctly with or without a trailing slash.
-     $url = ([string]$url).Trim().TrimEnd('/') + '/juqingjieshao.html'
-     $whr.Open('GET',$url,$false)
-     $whr.Send()
-     $txt = $whr.ResponseText
-     $whr.Abort()
-     # Title = text of the <title> element up to the '剧情介绍' marker.
-     $title = ''
-     if($txt -match '(?s)<title>(.*?)</title>'){
-         $title = $Matches[1] -replace '剧情介绍.*$',''
-     }
-     return &{
-         $title
-         [regex]::Matches($txt,'<a class="link" href="(.*?)">(.*?)</a>') | Sort-Object {[int]$_.Groups[2].Value} | foreach {
-             $_.Groups[2].Value + '|' + 'https://www.tvzn.com' + $_.Groups[1].Value
-         }
-     }
- }
-
- # Function: download the synopsis text of every episode in a list.
- # Parameter: $part_list - show title followed by 'N|episode-url' strings.
- # Output: 'TTT' + title, then for each episode whose page contains a "tn-detail-text" block a
- #         '第N集' heading and the block's text with tags and whitespace removed.
- function Get-PartInfo($part_list){
-     # A result holding only the title arrives as a bare string; wrap it so that [0] is the whole
-     # title rather than its first character.
-     $part_list = @($part_list)
-     # Title
-     $title = $part_list[0]
-     Write-Host $title
-     'TTT' + $title
-     # Episodes
-     for($i = 1; $i -lt $part_list.Count; $i++){
-         $arr = $part_list[$i] -split '\|'
-         Write-Host (' 第{0}集 {1}' -f $arr[0],$arr[1])
-         $url = $arr[1]
-         $whr.Open('GET',$url,$false)
-         $whr.Send()
-         $txt = $whr.ResponseText
-         $whr.Abort()
-         if($txt -match '(?s)<div class="tn-detail-text".*?</p>'){
-             '第{0}集' -f $arr[0]
-             # Strip tags, &nbsp; entities and every run of whitespace (\s also covers U+00A0).
-             $Matches[0] -replace '<.*?>|&nbsp;|\s+',''
-         }
-     }
- }
-
- # Read the list file and process one show per line, appending each show's text to the output file.
- Get-Content -Path $list_file | ForEach-Object {
-     $part_list = Get-PartList -url $_
-     $data = Get-PartInfo -part_list $part_list
-     Out-File -FilePath $out_file -InputObject $data -Append
- }
复制代码
作者: lxh623 时间: 2021-10-11 07:51
回复 3# zaqmlp
电视猫
信息不全,后面的内容总是缺失。麻烦检查一下。
谢谢!
作者: went 时间: 2021-10-11 11:09
tvmao.bat- #&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~f0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
- # Everything from this line on is PowerShell: the launcher line above is dropped by Select-Object -Skip 1.
- # The launcher passes the script's full path (%~f0) so it is still found after the cd /d and when
- # the file was started without its .bat extension.
- cls
- # Shared WinHTTP COM request object used for every download.
- $whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
-
- # Apply the fixed set of request headers to an already opened request object.
- # Parameter: $whr - the request object to configure.
- function Set-RequestHeader($whr){
-     # Name/value pairs, applied in this order.
-     $headers = @(
-         @('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'),
-         @('Host','www.tvmao.com'),
-         @('Connection','Keep-Alive')
-     )
-     foreach($pair in $headers){
-         $whr.SetRequestHeader($pair[0],$pair[1])
-     }
- }
-
- # Function: download an episode index page and list the links found in its "epipage" block.
- # Parameter: $url - address of the index page.
- # Output: the show title followed by one absolute URL per href inside the block;
- #         $null when $url is $null or the page has no such block.
- function Get-PartUrlList($url){
-     if($url -eq $null){
-         return $null
-     }
-     $whr.Open('GET',$url,$false)
-     Set-RequestHeader -whr $whr
-     $whr.Send()
-     $html = $whr.ResponseText
-     $whr.Abort()
-     # Title: content of the <strong class="font24"> element, empty when absent.
-     $showTitle = ''
-     if($html -match '<strong class="font24">(.*?)</strong>'){
-         $showTitle = $Matches[1]
-     }
-     # Without an episode pager block there is nothing to list.
-     if(-not ($html -match '<div class="epipage.*?>.*?</div>')){
-         return $null
-     }
-     $pager = $Matches[0]
-     # Emit the title first, then every link of the pager block prefixed with the site address.
-     $showTitle
-     foreach($link in [regex]::Matches($pager,'(?s)href="(.*?)"')){
-         'https://www.tvmao.com' + $link.Groups[1].Value
-     }
-     return
- }
-
- # Function: download one episode page and output its heading and synopsis text.
- # Parameter: $part_url - episode URL; it must end in '-<number>', otherwise nothing is output.
- # Output: '第N集', then (when the page has an <article ... epi_c"> block) that block's text with
- #         tags removed, the trailing '(...转载许可)' notice cut off and outer whitespace trimmed.
- function Get-PartInfo($part_url){
-     # Links that do not end in an episode number are ignored.
-     if(-not ($part_url -match '-(\d+)$')){
-         return;
-     }
-     $episodeNo = $Matches[1]
-     Write-Host ('解析第{0}集 {1}' -f $episodeNo,$part_url)
-     '第{0}集' -f $episodeNo
-     $whr.Open('GET',$part_url,$false)
-     Set-RequestHeader -whr $whr
-     $whr.Send()
-     $page = $whr.ResponseText
-     $whr.Abort()
-     if($page -match '(?s)<article class=".*?epi_c">.*?</article>'){
-         $plain = $Matches[0] -replace '(?s)<.*?>',''
-         $plain = $plain -replace '(?s)\(.*?转载许可\).*$',''
-         $plain -replace '^\s+|\s+$',''
-     }
- }
-
- #--------------------------------------------------------------------
- # Settings
- $list_file = 'd.txt'           # input list, one show URL per line
- $out_file = 'tvmao_info.txt'   # output file the collected text is appended to
- #--------------------------------------------------------------------
- # Process one show per list line: look up its episode links, then append 'TTT' + title and
- # the text of every episode to the output file.
- Get-Content $list_file | foreach {
-     # Tolerate blank lines and a trailing slash in the list.
-     $show_url = ([string]$_).Trim().TrimEnd('/')
-     if($show_url -eq ''){ return }
-     # @() keeps [0] and .Count meaningful when the lookup yields $null or only a title string.
-     $part_list = @(Get-PartUrlList -url ($show_url + '/episode'))
-     Write-Host $part_list[0]
-     if($part_list.Count -gt 1){
-         $data = &{
-             'TTT' + $part_list[0]
-             for($i = 1;$i -lt $part_list.Count; $i++){
-                 Get-PartInfo -part_url $part_list[$i]
-             }
-         }
-         Out-File -InputObject $data $out_file -Append
-     }
-     Write-Host '----------------'
- }
复制代码
作者: zaqmlp 时间: 2021-10-11 16:23
回复 5# lxh623
已修改
欢迎光临 批处理之家 (http://bbs.bathome.net/) |
Powered by Discuz! 7.2 |