批处理之家 - Powered by Discuz! Board

.. http://cmd1152.ys168.com/ 文件区下载 "lxh623已经提取好的资料.zip"
七猫影视-提取.html
七猫影视-提取.xls
定制说明.txt
搜视网-提取.html
搜视网-提取.xls
文本格式
新站到-提取.html
新站到-提取.xls
附赠提取工具

复制代码

@echo off
REM Downloads a run of numbered listing pages with curl, passes each one through
REM the external "wincp" tool, then feeds every converted page to the external
REM "zq" tool and collects the combined output in out.html.
REM NOTE(review): "-f 65001 -t 936" presumably means "from UTF-8 to GBK" - confirm
REM against the wincp documentation.

REM Base URL of the listing; the page number is appended in the loop below
set "url=https://www.tvsou.com/class/dianshiju/--"
REM Number of pages to fetch
set "num=100"
REM Create the working folders up front; without them the redirect into home\
REM and the wincp output into ansi\ fail on a fresh directory
if not exist "home\" md "home"
if not exist "ansi\" md "ansi"
REM Fetch and convert every page
for /l %%a in (1,1,%num%) do (
echo;正在下载第 %%a 个页面
curl "%url%--%%a" > home\%%a.txt
echo;正在转码第 %%a 个页面
wincp home\%%a.txt -o ansi\N_%%a.txt -f 65001 -t 936
)
REM Run the extractor over every converted page and write the result to out.html
(
for /l %%a in (1,1,%num%) do (
zq ansi\N_%%a.txt
)
) > out.html
pause

复制代码

#&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
# The line above is the batch half of this batch/PowerShell hybrid and must stay the
# very first line: cmd switches to the script's folder, then PowerShell reads this
# file, skips that first line and evaluates the remainder; afterwards cmd pauses and exits.
cls
# Shared WinHttpRequest COM object used by the functions below
$whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
# Get-PageList: download one listing page and emit the numeric id of each show linked on it.
#   $page - page number substituted into the listing URL
# Emits nothing when the HTTP status is anything other than 200.
function Get-PageList($page){
    $listUrl = 'https://www.km.com/tv/-----{0}.html' -f $page
    $whr.Open('GET',$listUrl,$false)
    $whr.Send()
    # Guard clause: only a 200 response is parsed
    if($whr.Status -ne 200){
        return
    }
    $html = $whr.ResponseText
    $whr.Abort()
    # href value of the <a> that directly follows each poster container div
    $hrefRegex = '(?<=<div class="filter_res_image scale_image_container">\s+<a href=").*?(?=")'
    foreach($hit in [regex]::Matches($html,$hrefRegex)){
        # Keep only the digits between the last '/' and the trailing "html" suffix
        if($hit.Value -match '/(\d+).html$'){
            $Matches[1]
        }
    }
}
# Get-MainInfo: emit a plain-text report for one show, scraped from three www.km.com pages
# (detail page, cast page, role page) through the shared $whr object.
#   $key - show id substituted into the page URLs
# Output order: title, poster URL, overview, score, production info, cast list,
# character introductions. Section headings are always emitted; the data lines only
# appear when the corresponding regex matches. The HTTP status is not checked here.
function Get-MainInfo($key){
# Main detail page
$url = 'https://www.km.com/tv/{0}.html' -f $key
$whr.Open('GET',$url,$false)
$whr.Send()
$txt = $whr.ResponseText
$whr.Abort()
# Title - written to the output with a "TTT" prefix and echoed to the console
# NOTE(review): "TTT" (and "ZZZ" further down) look like markers for later post-processing - confirm
$pattern = '(?s)<div class="inside-title.*?>.*?>(.*?)<'
if($txt -match $pattern){
'名称: TTT' + $Matches[1]
Write-Host ('解析电视剧: ' + $Matches[1])
}
# Poster image URL
# NOTE(review): the doubled '' is an escaped quote, so the emitted line carries a
# literal ' in front of https: - confirm this is intended
$pattern = '(?s)<div class="video_poster.*?">.*?<img src="(.*?)"'
if($txt -match $pattern){
'图片地址: ''https:' + $Matches[1]
}
# Overview
"`r`n【概览】"
$pattern = '(?s)mr20">(.*?)<a class="unfold"'
if($txt -match $pattern){
# Drop <script> blocks, then all tags, then runs of two or more whitespace characters
$t = $Matches[1] -replace '(?s)<script>.*?</script>','' -replace '<.*?>','' -replace '\s{2,}',''
# Start a new line in front of each known field label
$t -replace '(导演：)|(主演：)|(地区/类型：)|(剧情集数：)|(播出时间：)|(在线观看网站：)|(别名：)|(片长：)',("`r`n" + '$0')
}
# Score
$pattern = '(?s)<p class="score_num.*?>.*?>(.*?)<'
if($txt -match $pattern){
'综合评分: ' + $Matches[1]
}
# Production info
"`r`n【幕后信息】"
$pattern = '(?s)幕后信息</div>(.*?)<a class="intro_fold'
if($txt -match $pattern){
# Strip tags and whitespace runs, then break the line before each label and before every 《
$Matches[1] -replace '<.*?>','' -replace '\s{2,}','' -replace '(编剧：)|(制片人：)|(TV首播时间：)|(在线播放平台：)|(出品公司：)|《',("`r`n" + '$0')
}
# Cast list (separate page)
"`r`n【演员表】"
$url = 'https://www.km.com/tv/yanyuan/{0}.html' -f $key
$whr.Open('GET',$url,$false)
$whr.Send()
$txt = $whr.ResponseText
$whr.Abort()
# One match per actor block: from the detail div up to the start of the "actor-works" div
$pattern = '(?s)<div class="actor-list-detail same_col">.*?<div class="actor-works same_col_bottom">'
[regex]::Matches($txt,$pattern) | foreach {
'ZZZ' + ($_.Value -replace '<.*?>','' -replace '\s{2,}','' -replace '饰演'," 饰演 ")
}
# Character introductions (separate page)
"`r`n【人物介绍】"
$url = 'https://www.km.com/tv/role/{0}.html' -f $key
$whr.Open('GET',$url,$false)
$whr.Send()
$txt = $whr.ResponseText
$whr.Abort()
# One match per role: from the role-name div up to the start of the role-intro-js div
$pattern = '(?s)<div class="role-name">.*?<div class="role-intro-js">'
[regex]::Matches($txt,$pattern) | foreach {
$_.Value -replace '<.*?>','' -replace '\s{2,}','' -replace '演员',' 演员' -replace '-->简介：',"-->简介：`r`n`t"
}
}
# Save-PageInfo: parse one listing page and store the collected text in its own file.
#   $page - listing page number; it selects the page to download and is also
#           formatted into the output file name page_NNN.txt
# If that file already exists, a "skipped" message is emitted and nothing else happens.
function Save-PageInfo($page){
    $f = 'page_{0:000}.txt' -f $page
    # Skip pages that already have an output file
    if([System.IO.File]::Exists($f)){
        '第{0}页已解析,跳过...' -f $page
        return;
    }
    # Parse the requested page. The page number used to be hard-coded to 1 here,
    # so every output file would have contained the data of page 1.
    &{
        Get-PageList -page $page | foreach {
            Get-MainInfo -key $_
            '----------------------------------'
        }
    } | Out-File $f
}
# Parse listing page 1 and save its data
Save-PageInfo -page 1

复制代码

https://www.km.com/tv/98324.html
https://www.km.com/tv/96830.html
https://www.km.com/tv/56848.html
https://www.km.com/tv/97869.html
https://www.km.com/tv/94010.html
https://www.km.com/tv/97924.html
https://www.km.com/tv/96880.html
https://www.km.com/tv/99023.html
https://www.km.com/tv/58624.html
https://www.km.com/tv/95650.html
https://www.km.com/tv/96842.html
https://www.km.com/tv/97604.html
https://www.km.com/tv/94180.html
https://www.km.com/tv/99840.html
https://www.km.com/tv/93130.html
https://www.km.com/tv/57571.html
https://www.km.com/tv/98348.html
https://www.km.com/tv/97183.html
https://www.km.com/tv/99770.html
https://www.km.com/tv/61631.html
https://www.km.com/tv/97788.html
https://www.km.com/tv/95876.html
https://www.km.com/tv/97131.html
https://www.km.com/tv/58078.html

复制代码

#&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
# The line above is the batch half of this batch/PowerShell hybrid and must stay the
# very first line: cmd switches to the script's folder, then PowerShell reads this
# file, skips that first line and evaluates the remainder; afterwards cmd pauses and exits.
cls
# Shared WinHttpRequest COM object used by the functions below
$whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
# Get-MainInfo: emit a plain-text report for one show, scraped from www.km.com through
# the shared $whr object, followed by the per-episode plots from Get-PartInfo.
#   $key - show id substituted into the page URLs
# Output order: title, poster URL, overview, score, production info, cast list,
# character introductions, episode plots, separator line. Section headings are always
# emitted; data lines only appear when the corresponding regex matches.
# The HTTP status is not checked here.
function Get-MainInfo($key){
# Main detail page
$url = 'https://www.km.com/tv/{0}.html' -f $key
$whr.Open('GET',$url,$false)
$whr.Send()
$txt = $whr.ResponseText
$whr.Abort()
# Title - written to the output with a "TTT" prefix and echoed to the console
# NOTE(review): "TTT" (and "ZZZ" further down) look like markers for later post-processing - confirm
$pattern = '(?s)<div class="inside-title.*?>.*?>(.*?)<'
if($txt -match $pattern){
'名称: TTT' + $Matches[1]
Write-Host ('解析电视剧: ' + $Matches[1])
}
# Poster image URL
$pattern = '(?s)<div class="video_poster.*?">.*?<img src="(.*?)"'
if($txt -match $pattern){
'图片地址: https:' + $Matches[1]
}
# Overview
"`r`n【概览】"
$pattern = '(?s)mr20">(.*?)<a class="unfold"'
if($txt -match $pattern){
# Drop <script> blocks, then all tags, then runs of two or more whitespace characters
$t = $Matches[1] -replace '(?s)<script>.*?</script>','' -replace '<.*?>','' -replace '\s{2,}',''
# Start a new line in front of each known field label, then remove the "主演：" line again
$t -replace '(导演：)|(主演：)|(地区/类型：)|(剧情集数：)|(播出时间：)|(在线观看网站：)|(别名：)|(片长：)',("`r`n" + '$0') -replace "主演：.*?`r`n",''
}
# Score
$pattern = '(?s)<p class="score_num.*?>.*?>(.*?)<'
if($txt -match $pattern){
'综合评分: ' + $Matches[1]
}
# Plot summary text and total episode count, both handed to Get-PartInfo at the end
$plot_main = ''
$pattern = '剧情介绍：.*?>(.*?)<'
if($txt -match $pattern){
$plot_main = $Matches[1]
}
# $part is only assigned when the "(全部 N" marker is found on the page
$pattern = '>\(全部 (\d+)<'
if($txt -match $pattern){
$part = $Matches[1]
}
# Production info
"`r`n【幕后信息】"
$pattern = '(?s)幕后信息</div>(.*?)<a class="intro_fold'
if($txt -match $pattern){
# Strip tags and whitespace runs, then break the line before each label and before every 《
$Matches[1] -replace '<.*?>','' -replace '\s{2,}','' -replace '(编剧：)|(制片人：)|(TV首播时间：)|(在线播放平台：)|(出品公司：)|《',("`r`n" + '$0')
}
# Cast list (separate page)
"`r`n【演员表】"
$url = 'https://www.km.com/tv/yanyuan/{0}.html' -f $key
$whr.Open('GET',$url,$false)
$whr.Send()
$txt = $whr.ResponseText
$whr.Abort()
# NOTE(review): each match runs from one actor-list-detail div up to and including the
# opening tag of the next one, and [regex]::Matches returns non-overlapping matches, so
# the block whose opening tag was consumed cannot start a match of its own - confirm
# against the real page that no actor is skipped.
$pattern = '(?s)<div class="actor-list-detail same_col">.*?<div class="actor-list-detail same_col">'
[regex]::Matches($txt,$pattern) | foreach {
# Only blocks that contain the "饰演" markup are emitted
if($_.Value.Contains('<em>饰</em><em>演</em>')){
# '+' binds tighter than -replace, so the replacements run on the already prefixed string;
# everything from "最近作品：" to the end is cut off
'ZZZ' + $_.Value -replace '<.*?>|\s{2,}',''-replace '饰演'," 饰演 " -replace '最近作品：.*$',''
}
}
# Character introductions (separate page)
"`r`n【人物介绍】"
$url = 'https://www.km.com/tv/role/{0}.html' -f $key
$whr.Open('GET',$url,$false)
$whr.Send()
$txt = $whr.ResponseText
$whr.Abort()
# One match per role: from the role-name div up to the start of the role-intro-js div
$pattern = '(?s)<div class="role-name">.*?<div class="role-intro-js">'
[regex]::Matches($txt,$pattern) | foreach {
$_.Value -replace '<.*?>','' -replace '\s{2,}','' -replace '演员',' 演员' -replace '-->简介：',"-->简介：`r`n`t"
}
# Per-episode plots
Get-PartInfo -key $key -title $plot_main -max $part
'-----------------------------------------------------------------------------'
}
# Get-PartInfo: emit the episode-by-episode plot section for one show.
#   $key   - show id used in the episode page URLs
#   $title - plot summary text printed under the section heading
#   $max   - number of episodes; pages 1..$max are requested one after another
# An episode whose page has no article-content div only gets its heading line.
function Get-PartInfo($key,$title,$max){
    "`r`n【分集剧情】"
    "`t剧情介绍`n`t`t" + $title
    $episode = 1
    while($episode -le $max){
        "`t第{0}集`t`t" -f $episode
        # Download the page of this episode
        $pageUrl = 'https://www.km.com/tv/{0}/2_{1}.html' -f $key,$episode
        $whr.Open('GET',$pageUrl,$false)
        $whr.Send()
        $html = $whr.ResponseText
        $whr.Abort()
        # Plot text: strip the tags, collapse whitespace runs to one space, trim the ends
        if($html -match '(?s)<div class="article-content">.*?</div>'){
            $plain = $Matches[0] -replace '<.*?>','' -replace '\s{2,}',' '
            "`t`t" + $plain.Trim()
        }
        $episode++
    }
}
# Settings
$cur_file = 'pass.txt' # stores how many lines of the URL list have been handled
$url_file = 'a.txt'    # URL list file
$out_file = 'info.txt' # output file
# Read the saved progress
$cur = 0
if([System.IO.File]::Exists($cur_file)){
    $line = Get-Content $cur_file | Select-Object -First 1
    if($line -match '\d+'){
        # Cast the matched digits only, so stray characters on the line cannot break the cast
        $cur = [int]$Matches[0]
    }
}
Write-Host ('上次解析个数: {0}' -f $cur)
# Walk through the remaining lines of the URL list
Get-Content $url_file | Select-Object -Skip $cur | foreach {
    if($_ -match 'https://www.km.com/tv/(\d+).html'){
        # Parse the show and append the report
        Get-MainInfo -key $Matches[1] | Out-File $out_file -Append
    }
    # Count every consumed line, matching or not: -Skip above works on lines, so a
    # counter that only advanced on matching lines would re-parse shows after a restart
    # whenever the list contains blank or non-URL lines. For a list made of URL lines
    # only, the saved number is the same as before.
    ++$cur
    Out-File -InputObject $cur -FilePath $cur_file
}

复制代码

夺金
光芒
一生一世
叛逆者
火红年华
国子监来了个女弟子
君九龄
周生如故
乔家的儿女
扫黑风暴
程序员那么可爱
功勋
夺金战
当爱情遇上科学家
双刺
燃烧大地
启航：当风起时
你好检察官
我哥我嫂
理想之城

复制代码

#&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
# The line above is the batch half of this batch/PowerShell hybrid and must stay the
# very first line: cmd switches to the script's folder, then PowerShell reads this
# file, skips that first line and evaluates the remainder; afterwards cmd pauses and exits.
cls
# Shared WinHttpRequest COM object used by the functions below
$whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
# Set-RequestHeader: apply the fixed User-Agent, Host and Connection headers to a request.
#   $whr - request object exposing SetRequestHeader(name, value)
function Set-RequestHeader($whr){
    # Name/value pairs, applied in this order
    $headerPairs = @(
        @('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'),
        @('Host','www.tvmao.com'),
        @('Connection','Keep-Alive')
    )
    foreach($pair in $headerPairs){
        $whr.SetRequestHeader($pair[0],$pair[1])
    }
}
# Get-Url: look a keyword up through the tvmao search endpoint and return the episode-list
# URL built from the first "url" value in the response.
#   $kw - search keyword; it is URL-encoded before being sent
# Returns $null when the response contains no "url" value.
function Get-Url($kw){
    $encoded = [System.Net.WebUtility]::UrlEncode($kw)
    $query = 'https://www.tvmao.com/servlet/queryobject?type=all&term=' + $encoded
    $whr.Open('GET',$query,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $body = $whr.ResponseText
    $whr.Abort()
    # No hit: report that with $null
    if(-not ($body -match '"url":"(.+?)"')){
        return $null
    }
    return 'https://www.tvmao.com' + $Matches[1] + '/episode'
}
# Get-PartUrlList: download an episode-list page and emit one absolute URL per href
# found inside its "epipage" div.
#   $url - page to download; $null is passed straight back
# Returns $null when the page has no matching div.
function Get-PartUrlList($url){
    if($url -eq $null){
        return $null
    }
    $whr.Open('GET',$url,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $html = $whr.ResponseText
    $whr.Abort()
    # The pager div holds the links to the individual episodes
    if(-not ($html -match '<div class="epipage.*?>.*?</div>')){
        return $null
    }
    $pager = $Matches[0]
    foreach($link in [regex]::Matches($pager,'(?s)href="(.*?)"')){
        'https://www.tvmao.com' + $link.Groups[1].Value
    }
}
# Get-PartInfo: emit the heading and plot text of a single episode.
#   $part_url - episode URL; it must end in "-<number>", otherwise nothing is emitted
# Prints a progress line to the console, emits the episode heading, then the article
# text with its tags removed (if the page contains a matching <article>).
function Get-PartInfo($part_url){
    # The episode number is the trailing digit run after the last '-'
    if(-not ($part_url -match '-(\d+)$')){
        return
    }
    $episode = $Matches[1]
    Write-Host ('解析第{0}集 {1}' -f $episode,$part_url)
    '第{0}集' -f $episode
    $whr.Open('GET',$part_url,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $html = $whr.ResponseText
    $whr.Abort()
    if($html -match '(?s)<article class=".*?epi_c">.*?</article>'){
        # NOTE(review): without (?m) the '$' anchors only match at the end of the input,
        # so the second -replace looks like it can never match - confirm the intent
        $Matches[0] -replace '(?s)<.*?>','' -replace '(?s)$.*?转载许可$.*$',''
    }
}
# Settings
$cur_file = 'pass.txt'  # stores how many lines of the name list have been handled
$list_file = 'b.txt'    # list of show names, one per line
$out_file = 'info.txt'  # output file
# Read the saved progress
$cur = 0
if([System.IO.File]::Exists($cur_file)){
    $line = Get-Content $cur_file | Select-Object -First 1
    if($line -match '\d+'){
        # Cast the matched digits only, so stray characters on the line cannot break the cast
        $cur = [int]$Matches[0]
    }
}
Write-Host ('上次解析个数: {0}' -f $cur)
# Walk through the remaining show names
Get-Content $list_file | Select-Object -Skip $cur | foreach {
    $name = $_.Trim()
    # Blank lines are counted as handled but never sent to the search endpoint;
    # an empty search term cannot identify a show
    $url = $null
    if($name -ne ''){
        $url = Get-Url -kw $name
    }
    if($null -ne $url){
        Write-Host ('正在解析 {0} {1}' -f $_,$url)
        Write-Host '---'
        # Collect the name line, every episode block and the separator, then append them together
        $info = &{
            '名称: {0}' -f $name
            Get-PartUrlList -url $url | foreach {
                Get-PartInfo -part_url $_
            }
            '--------------------------------------------------'
        }
        Out-File -FilePath $out_file -InputObject $info -Append
        Write-Host '--------------------------------------------------'
    }
    # Save the progress after every line so an interrupted run can resume
    ++$cur
    Out-File -FilePath $cur_file -InputObject $cur
}

复制代码

#&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
# The line above is the batch half of this batch/PowerShell hybrid and must stay the
# very first line: cmd switches to the script's folder, then PowerShell reads this
# file, skips that first line and evaluates the remainder; afterwards cmd pauses and exits.
cls
# Shared WinHttpRequest COM object used by the functions below
$whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
# Set-RequestHeader: put the fixed browser-style headers on a request.
#   $whr - request object exposing SetRequestHeader(name, value)
# Headers are set in the order User-Agent, Host, Connection.
function Set-RequestHeader($whr){
    $agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'
    $hostName = 'www.tvmao.com'
    $connection = 'Keep-Alive'
    $whr.SetRequestHeader('User-Agent',$agent)
    $whr.SetRequestHeader('Host',$hostName)
    $whr.SetRequestHeader('Connection',$connection)
}
# Get-Url: look a keyword up through the tvmao search endpoint and return the
# relationship-chart URL built from the first "url" value in the response.
#   $kw - search keyword; it is URL-encoded before being sent
# Returns $null when the response contains no "url" value.
function Get-Url($kw){
    $query = 'https://www.tvmao.com/servlet/queryobject?type=all&term=' + [System.Net.WebUtility]::UrlEncode($kw)
    $whr.Open('GET',$query,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $body = $whr.ResponseText
    $whr.Abort()
    # Stays $null unless the response names a url
    $result = $null
    if($body -match '"url":"(.+?)"'){
        $result = 'https://www.tvmao.com' + $Matches[1] + '/renwuguanxitu'
    }
    return $result
}
# Get-GXT: download a relationship-chart page and return the src of the chart image.
#   $url - page to download
# Returns $null when the page has no d_guanxi_img div followed by an <img> whose src
# contains "/guanxitu/".
function Get-GXT($url){
    $whr.Open('GET',$url,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $html = $whr.ResponseText
    $whr.Abort()
    $imgRegex = '(?s)<div class="d_guanxi_img".*?>.*?<img src="(.*?/guanxitu/.*?)"'
    $found = $null
    if($html -match $imgRegex){
        $found = $Matches[1]
    }
    return $found
}
# Settings
$cur_file = 'pass.txt'  # stores how many lines of the name list have been handled
$list_file = 'c.txt'    # list of show names, one per line
$out_file = 'info.txt'  # output file
# Read the saved progress
$cur = 0
if([System.IO.File]::Exists($cur_file)){
    $line = Get-Content $cur_file | Select-Object -First 1
    if($line -match '\d+'){
        # Cast the matched digits only, so stray characters on the line cannot break the cast
        $cur = [int]$Matches[0]
    }
}
Write-Host ('上次解析个数: {0}' -f $cur)
# Walk through the remaining show names
Get-Content $list_file | Select-Object -Skip $cur | foreach {
    $name = $_.Trim()
    # Blank lines are counted as handled but never sent to the search endpoint;
    # an empty search term cannot identify a show
    $url = $null
    if($name -ne ''){
        $url = Get-Url -kw $name
    }
    if($null -ne $url){
        Write-Host ('正在解析 {0}' -f $_)
        $png_url = Get-GXT -url $url
        if($null -eq $png_url){
            'nothing'
        } else {
            # One line per show: name between the TTT/YYY markers, then the image URL
            $info = 'TTT{0}YYY人物关系图: {1}' -f ($name,$png_url)
            Out-File -FilePath $out_file -InputObject $info -Append
        }
        Write-Host '--------------------------------------------------'
        # Short pause between shows
        Start-Sleep -Milliseconds 500
    }
    # Save the progress after every line so an interrupted run can resume
    ++$cur
    Out-File -FilePath $cur_file -InputObject $cur
}

复制代码