【完结】求网页信息抓取+150元 - 有偿求助 - 批处理之家 BAT,CMD,批处理,PowerShell,VBS,DOS

Rank: 6 Rank: 6

帖子: 864
积分: 1738
技术: 229
捐助: 0
注册时间: 2017-9-22

1楼 跳转到 »

发表于 2021-10-4 12:08 | 显示全部帖子

 #&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
# The line above must remain the first line of the file: cmd.exe executes it
# (cd to the script folder, feed this file minus its first line to PowerShell,
# then pause and exit), while PowerShell itself treats it as a comment.
cls
# WinHttp COM request object used by the functions below for every HTTP GET.
$whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
 
# Function: list the show ids linked from one listing page.
#   $page - page number substituted into the listing URL.
# Emits one id string per matching link; emits nothing when the HTTP status
# is not 200.
function Get-PageList($page){
    $listUrl = 'https://www.km.com/tv/-----{0}.html' -f $page
    $whr.Open('GET',$listUrl,$false)
    $whr.Send()
    # Guard clause: any status other than 200 produces no output.
    if($whr.Status -ne 200){
        return
    }
    $html = $whr.ResponseText
    $whr.Abort()
    # Matches the href value that follows each poster container div.
    $linkPattern = '(?<=<div class="filter_res_image scale_image_container">\s+<a href=").*?(?=")'
    foreach($link in [regex]::Matches($html,$linkPattern)){
        # Keep only the digits in front of the trailing ".html".
        if($link.Value -match '/(\d+).html$'){
            $Matches[1]
        }
    }
}
 
# Function: scrape the main information of one show and emit it as text.
#   $key - show id substituted into the km.com page URLs.
# Pipeline output, in order: title line, poster address, overview section,
# score, production-info section, cast list, character introductions.
# The title is additionally written to the console as a progress message.
# Relies on the script-level $whr WinHttp request object.
function Get-MainInfo($key){
    # Main information page
    $url = 'https://www.km.com/tv/{0}.html' -f $key
    $whr.Open('GET',$url,$false)
    $whr.Send()
    $txt = $whr.ResponseText
    $whr.Abort()

    # Title
    $pattern = '(?s)<div class="inside-title.*?>.*?>(.*?)<'
    if($txt -match $pattern){
        '名称: TTT' + $Matches[1]
        Write-Host ('解析电视剧: ' + $Matches[1])
    }
    # Poster image address; the captured src value is prefixed with "https:".
    $pattern = '(?s)<div class="video_poster.*?">.*?<img src="(.*?)"'
    if($txt -match $pattern){
        # Fix: the label literal used to contain '' (an escaped apostrophe),
        # which put an unbalanced ' in front of the address.
        '图片地址: https:' + $Matches[1]
    }
    # Overview
    "`r`n【概览】"
    $pattern = '(?s)mr20">(.*?)<a class="unfold"'
    if($txt -match $pattern){
        # Drop inline scripts and tags, then collapse whitespace runs.
        $t = $Matches[1] -replace '(?s)<script>.*?</script>','' -replace '<.*?>','' -replace '\s{2,}',''
        # Start a new line in front of every field label.
        $t -replace '(导演：)|(主演：)|(地区/类型：)|(剧情集数：)|(播出时间：)|(在线观看网站：)|(别名：)|(片长：)',("`r`n" + '$0')
    }

    # Score
    $pattern = '(?s)<p class="score_num.*?>.*?>(.*?)<'
    if($txt -match $pattern){
        '综合评分: ' + $Matches[1]
    }

    # Production information
    "`r`n【幕后信息】"
    $pattern = '(?s)幕后信息</div>(.*?)<a class="intro_fold'
    if($txt -match $pattern){
        $Matches[1] -replace '<.*?>','' -replace '\s{2,}','' -replace '(编剧：)|(制片人：)|(TV首播时间：)|(在线播放平台：)|(出品公司：)|《',("`r`n" + '$0')
    }

    # Cast list (separate page)
    "`r`n【演员表】"
    $url = 'https://www.km.com/tv/yanyuan/{0}.html' -f $key
    $whr.Open('GET',$url,$false)
    $whr.Send()
    $txt = $whr.ResponseText
    $whr.Abort()

    $pattern = '(?s)<div class="actor-list-detail same_col">.*?<div class="actor-works same_col_bottom">'
    [regex]::Matches($txt,$pattern) | foreach {
        # One line per actor block, tags stripped, prefixed with the ZZZ marker.
        'ZZZ' + ($_.Value -replace '<.*?>','' -replace '\s{2,}','' -replace '饰演'," 饰演 ")
    }

    # Character introductions (separate page)
    "`r`n【人物介绍】"
    $url = 'https://www.km.com/tv/role/{0}.html' -f $key
    $whr.Open('GET',$url,$false)
    $whr.Send()
    $txt = $whr.ResponseText
    $whr.Abort()
    $pattern = '(?s)<div class="role-name">.*?<div class="role-intro-js">'
    [regex]::Matches($txt,$pattern) | foreach {
        $_.Value -replace '<.*?>','' -replace '\s{2,}','' -replace '演员',' 演员' -replace '-->简介：',"-->简介：`r`n`t"
    }
}
 
# Function: save the data of one listing page into one file.
#   $page - listing page number; output goes to page_NNN.txt (relative path).
# When that file already exists the page is skipped and a notice is emitted.
function Save-PageInfo($page){
    # Skip pages that were already parsed.
    $f = 'page_{0:000}.txt' -f $page
    if([System.IO.File]::Exists($f)){
        '第{0}页已解析,跳过...' -f $page
        return
    }
    # Parse every show on the page and write all output to the page file.
    &{
        # Fix: fetch the requested page; the original always fetched page 1,
        # so every page_NNN.txt would have held the first page's data.
        Get-PageList -page $page | foreach {
            Get-MainInfo -key $_
            '----------------------------------'
        }
    } | Out-File $f
}
 
# Parse and save the data of the first listing page.
# Fix: a "COPY" label from the forum's copy button was fused onto the
# argument, which turned the page number into the string '1COPY'.
Save-PageInfo -page 1

1 评分人数

lxh623: 乐于助人技术 + 1

https://docs.qq.com/doc/DSVJ0d094Q1NtbGta

TOP

went

少校

Rank: 6 Rank: 6

帖子: 864
积分: 1738
技术: 229
捐助: 0
注册时间: 2017-9-22

2楼

发表于 2021-10-4 18:01 | 显示全部帖子

回复 9# lxh623

地址你保存的这种格式?

 https://www.km.com/tv/98324.html
https://www.km.com/tv/96830.html
https://www.km.com/tv/56848.html
https://www.km.com/tv/97869.html
https://www.km.com/tv/94010.html
https://www.km.com/tv/97924.html
https://www.km.com/tv/96880.html
https://www.km.com/tv/99023.html
https://www.km.com/tv/58624.html
https://www.km.com/tv/95650.html
https://www.km.com/tv/96842.html
https://www.km.com/tv/97604.html
https://www.km.com/tv/94180.html
https://www.km.com/tv/99840.html
https://www.km.com/tv/93130.html
https://www.km.com/tv/57571.html
https://www.km.com/tv/98348.html
https://www.km.com/tv/97183.html
https://www.km.com/tv/99770.html
https://www.km.com/tv/61631.html
https://www.km.com/tv/97788.html
https://www.km.com/tv/95876.html
https://www.km.com/tv/97131.html
https://www.km.com/tv/58078.html

1 评分人数

lxh623: 感谢分享技术 + 1

https://docs.qq.com/doc/DSVJ0d094Q1NtbGta

TOP

went

少校

Rank: 6 Rank: 6

帖子: 864
积分: 1738
技术: 229
捐助: 0
注册时间: 2017-9-22

3楼

发表于 2021-10-4 18:19 | 显示全部帖子

本帖最后由 went 于 2021-10-4 18:34 编辑

url保存为楼上格式,a.txt
#变量设置
$cur_file = 'pass.txt' #保存已读取的个数
$url_file = 'a.txt' #URL列表文件
$out_file = 'info.txt' #输出信息文件

 #&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
# The line above must remain the first line of the file: cmd.exe executes it
# (cd to the script folder, feed this file minus its first line to PowerShell,
# then pause and exit), while PowerShell itself treats it as a comment.
cls
# WinHttp COM request object used by the functions below for every HTTP GET.
$whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
 
# Function: scrape the main information of one show and emit it as text.
#   $key - show id substituted into the km.com page URLs.
# Pipeline output, in order: title line, poster address, overview section,
# score, production-info section, cast list, character introductions, the
# per-episode synopsis produced by Get-PartInfo, and a separator line.
# The title is additionally written to the console as a progress message.
# Relies on the script-level $whr WinHttp request object.
function Get-MainInfo($key){
    # Main information page
    $url = 'https://www.km.com/tv/{0}.html' -f $key
    $whr.Open('GET',$url,$false)
    $whr.Send()
    $txt = $whr.ResponseText
    $whr.Abort()

    # Title
    $pattern = '(?s)<div class="inside-title.*?>.*?>(.*?)<'
    if($txt -match $pattern){
        '名称: TTT' + $Matches[1]
        Write-Host ('解析电视剧: ' + $Matches[1])
    }
    # Poster image address; the captured src value is prefixed with "https:".
    $pattern = '(?s)<div class="video_poster.*?">.*?<img src="(.*?)"'
    if($txt -match $pattern){
        '图片地址: https:' + $Matches[1]
    }
    # Overview
    "`r`n【概览】"
    $pattern = '(?s)mr20">(.*?)<a class="unfold"'
    if($txt -match $pattern){
        # Drop inline scripts and tags, then collapse whitespace runs.
        $t = $Matches[1] -replace '(?s)<script>.*?</script>','' -replace '<.*?>','' -replace '\s{2,}',''
        # Start a new line in front of every field label, then remove the
        # whole 主演 (lead actors) line.
        $t -replace '(导演：)|(主演：)|(地区/类型：)|(剧情集数：)|(播出时间：)|(在线观看网站：)|(别名：)|(片长：)',("`r`n" + '$0') -replace "主演：.*?`r`n",''
    }

    # Score
    $pattern = '(?s)<p class="score_num.*?>.*?>(.*?)<'
    if($txt -match $pattern){
        '综合评分: ' + $Matches[1]
    }
    # Plot summary text and total episode count, both taken from the main page
    # before $txt is overwritten by the later requests.
    $plot_main = ''
    $pattern = '剧情介绍：.*?>(.*?)<'
    if($txt -match $pattern){
        $plot_main = $Matches[1]
    }
    # Fix: give $part a defined value, like $plot_main above, so that a page
    # without an episode count explicitly requests zero episodes.
    $part = 0
    $pattern = '>\(全部&nbsp;(\d+)<'
    if($txt -match $pattern){
        $part = $Matches[1]
    }

    # Production information
    "`r`n【幕后信息】"
    $pattern = '(?s)幕后信息</div>(.*?)<a class="intro_fold'
    if($txt -match $pattern){
        $Matches[1] -replace '<.*?>','' -replace '\s{2,}','' -replace '(编剧：)|(制片人：)|(TV首播时间：)|(在线播放平台：)|(出品公司：)|《',("`r`n" + '$0')
    }

    # Cast list (separate page)
    "`r`n【演员表】"
    $url = 'https://www.km.com/tv/yanyuan/{0}.html' -f $key
    $whr.Open('GET',$url,$false)
    $whr.Send()
    $txt = $whr.ResponseText
    $whr.Abort()
    # NOTE(review): each match runs up to and including the NEXT opening
    # "actor-list-detail" div, so that next block cannot start a match of its
    # own - entries may be skipped; confirm against the live page markup.
    $pattern = '(?s)<div class="actor-list-detail same_col">.*?<div class="actor-list-detail same_col">'
    [regex]::Matches($txt,$pattern) | foreach {
        # Only blocks that carry a role ("饰演") marker are emitted.
        if($_.Value.Contains('<em>饰</em><em>演</em>')){
            'ZZZ' + $_.Value -replace '<.*?>|\s{2,}','' -replace '饰演'," 饰演 " -replace '最近作品：.*$',''
        }
    }

    # Character introductions (separate page)
    "`r`n【人物介绍】"
    $url = 'https://www.km.com/tv/role/{0}.html' -f $key
    $whr.Open('GET',$url,$false)
    $whr.Send()
    $txt = $whr.ResponseText
    $whr.Abort()
    $pattern = '(?s)<div class="role-name">.*?<div class="role-intro-js">'
    [regex]::Matches($txt,$pattern) | foreach {
        $_.Value -replace '<.*?>','' -replace '\s{2,}','' -replace '演员',' 演员' -replace '-->简介：',"-->简介：`r`n`t"
    }

    # Per-episode synopsis
    Get-PartInfo -key $key -title $plot_main -max $part
    '-----------------------------------------------------------------------------'
}
 
# Function: emit the per-episode synopsis section of one show.
#   $key   - show id substituted into the episode page URL
#   $title - plot summary text printed under the first heading
#   $max   - highest episode number to fetch (episodes 1..$max)
function Get-PartInfo($key,$title,$max){
    "`r`n【分集剧情】"
    "`t剧情介绍`n`t`t" + $title
    $contentPattern = '(?s)<div class="article-content">.*?</div>'
    $episode = 1
    while($episode -le $max){
        "`t第{0}集`t`t" -f $episode
        $episodeUrl = 'https://www.km.com/tv/{0}/2_{1}.html' -f $key,$episode
        # Fetch the episode page.
        $whr.Open('GET',$episodeUrl,$false)
        $whr.Send()
        $html = $whr.ResponseText
        $whr.Abort()
        # Strip tags from the article block and squeeze whitespace runs.
        if($html -match $contentPattern){
            $body = $Matches[0] -replace '<.*?>','' -replace '\s{2,}',' '
            "`t`t" + $body.Trim()
        }
        $episode++
    }
}
 
# Settings
$cur_file = 'pass.txt' # progress file: number of list lines already processed
$url_file = 'a.txt'    # URL list file
$out_file = 'info.txt' # output file
# Load the saved progress, if any.
$cur = 0
if([System.IO.File]::Exists($cur_file)){
    $line = Get-Content $cur_file | Select-Object -First 1
    if($line -match '\d+'){
        # Fix: convert the matched digits, not the whole line, so stray
        # characters around the number cannot make the cast throw.
        $cur = [int]$Matches[0]
    }
}
Write-Host ('上次解析个数: {0}' -f $cur)
# Read the URL list and parse each entry, resuming after the saved position.
Get-Content $url_file | Select-Object -Skip $cur | foreach {
    if($_ -match 'https://www.km.com/tv/(\d+).html'){
        # Parse the show and append its text to the output file.
        Get-MainInfo -key $Matches[1] | Out-File $out_file -Append
    }
    # Fix: count every consumed line. -Skip counts lines, so counting only
    # matching URLs made a resumed run re-read entries whenever the list
    # contained blank or non-URL lines.
    ++$cur
    Out-File -InputObject $cur -FilePath $cur_file
}

1 评分人数

lxh623: 乐于助人技术 + 1

https://docs.qq.com/doc/DSVJ0d094Q1NtbGta

TOP

went

少校

Rank: 6 Rank: 6

帖子: 864
积分: 1738
技术: 229
捐助: 0
注册时间: 2017-9-22

4楼

发表于 2021-10-4 18:35 | 显示全部帖子

回复 12# lxh623

加上了

https://docs.qq.com/doc/DSVJ0d094Q1NtbGta

TOP

went

少校

Rank: 6 Rank: 6

帖子: 864
积分: 1738
技术: 229
捐助: 0
注册时间: 2017-9-22

5楼

发表于 2021-10-5 14:19 | 显示全部帖子

本帖最后由 went 于 2021-10-5 15:54 编辑

回复 14# lxh623

b.txt保存电视剧名称,ansi编码

 夺金
光芒
一生一世
叛逆者
火红年华
国子监来了个女弟子
君九龄
周生如故
乔家的儿女
扫黑风暴
程序员那么可爱
功勋
夺金战
当爱情遇上科学家
双刺
燃烧大地
启航：当风起时
你好检察官
我哥我嫂
理想之城

test.bat脚本文件,ansi编码

 #&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
# The line above must remain the first line of the file: cmd.exe executes it
# (cd to the script folder, feed this file minus its first line to PowerShell,
# then pause and exit), while PowerShell itself treats it as a comment.
cls
# WinHttp COM request object used by the functions below for every HTTP GET.
$whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
 
# Function: apply the fixed request headers to a request object.
#   $whr - object exposing SetRequestHeader(name, value); the callers in this
#          script invoke this between Open and Send.
function Set-RequestHeader($whr){
    # Name/value pairs, applied in this order.
    $headerPairs = @(
        @('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'),
        @('Host','www.tvmao.com'),
        @('Connection','Keep-Alive')
    )
    foreach($pair in $headerPairs){
        $whr.SetRequestHeader($pair[0],$pair[1])
    }
}
 
# Function: look up a show by keyword and build its episode-list URL.
#   $kw - search keyword (URL-encoded before being sent).
# Returns the URL string, or $null when the reply holds no "url" field.
function Get-Url($kw){
    $encodedTerm = [System.Net.WebUtility]::UrlEncode($kw)
    $queryUrl = 'https://www.tvmao.com/servlet/queryobject?type=all&term=' + $encodedTerm
    $whr.Open('GET',$queryUrl,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $reply = $whr.ResponseText
    $whr.Abort()
    # Guard clause: no "url" field means no result.
    if(-not ($reply -match '"url":"(.+?)"')){
        return $null
    }
    return 'https://www.tvmao.com' + $Matches[1] + '/episode'
}
 
# Function: collect the links to all episode pages of a show.
#   $url - episode-list page URL, or $null.
# Emits one absolute URL per href found inside the "epipage" div; returns
# $null when $url is $null or the div is not found.
function Get-PartUrlList($url){
    if($url -eq $null){
        return $null
    }
    $whr.Open('GET',$url,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $html = $whr.ResponseText
    $whr.Abort()
    $pagerPattern = '<div class="epipage.*?>.*?</div>'
    # Guard clause: without the pager div there is nothing to list.
    if(-not ($html -match $pagerPattern)){
        return $null
    }
    $hrefPattern = '(?s)href="(.*?)"'
    # $Matches[0] is the pager div matched above.
    foreach($hit in [regex]::Matches($Matches[0],$hrefPattern)){
        'https://www.tvmao.com' + $hit.Groups[1].Value
    }
}
 
# Function: fetch the synopsis of one episode.
#   $part_url - episode page URL ending in "-<number>".
# Emits the episode heading followed by the article text with tags removed;
# emits nothing when the URL does not end in "-<number>".
function Get-PartInfo($part_url){
    # Guard clause: the episode number is taken from the end of the URL.
    if(-not ($part_url -match '-(\d+)$')){
        return
    }
    $episodeNo = $Matches[1]
    Write-Host ('解析第{0}集 {1}' -f $episodeNo,$part_url)
    '第{0}集' -f $episodeNo

    $whr.Open('GET',$part_url,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $html = $whr.ResponseText
    $whr.Abort()

    $articlePattern = '(?s)<article class=".*?epi_c">.*?</article>'
    if($html -match $articlePattern){
        # Strip tags, then cut everything from the "(…转载许可)" notice onward.
        $Matches[0] -replace '(?s)<.*?>','' -replace '(?s)\(.*?转载许可\).*$',''
    }
}
 
# Settings
$cur_file = 'pass.txt' # progress file: number of list lines already processed
$list_file = 'b.txt'   # show-name list file
$out_file = 'info.txt' # output file
# Load the saved progress, if any.
$cur = 0
if([System.IO.File]::Exists($cur_file)){
    $line = Get-Content $cur_file | Select-Object -First 1
    if($line -match '\d+'){
        # Fix: convert the matched digits, not the whole line, so stray
        # characters around the number cannot make the cast throw.
        $cur = [int]$Matches[0]
    }
}
Write-Host ('上次解析个数: {0}' -f $cur)
# Read the show names and parse each one, resuming after the saved position.
Get-Content $list_file | Select-Object -Skip $cur | foreach {
    $url = Get-Url -kw $_.Trim()
    if($url -ne $null){
        Write-Host ('正在解析 {0} {1}' -f $_,$url)
        Write-Host '---'
        # Collect the name line, every episode synopsis and a separator,
        # then append them to the output file in one write.
        $info = &{
            '名称: {0}' -f $_.Trim()
            Get-PartUrlList -url $url | foreach {
                Get-PartInfo -part_url $_
            }
            '--------------------------------------------------'
        }
        Out-File -FilePath $out_file -InputObject $info -Append
        Write-Host '--------------------------------------------------'
    }
    # Progress counts every consumed line, found or not, to stay aligned
    # with Select-Object -Skip on the next run.
    ++$cur
    Out-File -FilePath $cur_file -InputObject $cur
}

1 评分人数

lxh623: 乐于助人技术 + 1

https://docs.qq.com/doc/DSVJ0d094Q1NtbGta

TOP

went

少校

Rank: 6 Rank: 6

帖子: 864
积分: 1738
技术: 229
捐助: 0
注册时间: 2017-9-22

6楼

发表于 2021-10-7 11:57 | 显示全部帖子

本帖最后由 went 于 2021-10-7 11:59 编辑

 #&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
# The line above must remain the first line of the file: cmd.exe executes it
# (cd to the script folder, feed this file minus its first line to PowerShell,
# then pause and exit), while PowerShell itself treats it as a comment.
cls
# WinHttp COM request object used by the functions below for every HTTP GET.
$whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1' 
 
# Function: apply the fixed request headers to a request object.
#   $whr - object exposing SetRequestHeader(name, value); the callers in this
#          script invoke this between Open and Send.
function Set-RequestHeader($whr){
    # Name/value pairs, applied in this order.
    $headerPairs = @(
        @('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'),
        @('Host','www.tvmao.com'),
        @('Connection','Keep-Alive')
    )
    foreach($pair in $headerPairs){
        $whr.SetRequestHeader($pair[0],$pair[1])
    }
}
 
# Function: look up a show by keyword and build its relationship-chart URL.
#   $kw - search keyword (URL-encoded before being sent).
# Returns the URL string, or $null when the reply holds no "url" field.
function Get-Url($kw){
    $encodedTerm = [System.Net.WebUtility]::UrlEncode($kw)
    $queryUrl = 'https://www.tvmao.com/servlet/queryobject?type=all&term=' + $encodedTerm
    $whr.Open('GET',$queryUrl,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $reply = $whr.ResponseText
    $whr.Abort()
    # Guard clause: no "url" field means no result.
    if(-not ($reply -match '"url":"(.+?)"')){
        return $null
    }
    return 'https://www.tvmao.com' + $Matches[1] + '/renwuguanxitu'
}
 
# Function: extract the relationship-chart image address from a show page.
#   $url - page URL to fetch.
# Returns the img src found inside the "d_guanxi_img" div (it must contain
# "/guanxitu/"), or $null when no such image is present.
function Get-GXT($url){
    $whr.Open('GET',$url,$false)
    Set-RequestHeader -whr $whr
    $whr.Send()
    $html = $whr.ResponseText
    $whr.Abort()
    $chartPattern = '(?s)<div class="d_guanxi_img".*?>.*?<img src="(.*?/guanxitu/.*?)"'
    # Guard clause: page without a chart image.
    if(-not ($html -match $chartPattern)){
        return $null
    }
    return $Matches[1]
}
 
# Settings
$cur_file = 'pass.txt' # progress file: number of list lines already processed
$list_file = 'c.txt'   # show-name list file
$out_file = 'info.txt' # output file
# Load the saved progress, if any.
$cur = 0
if([System.IO.File]::Exists($cur_file)){
    $line = Get-Content $cur_file | Select-Object -First 1
    if($line -match '\d+'){
        # Fix: convert the matched digits, not the whole line, so stray
        # characters around the number cannot make the cast throw.
        $cur = [int]$Matches[0]
    }
}
Write-Host ('上次解析个数: {0}' -f $cur)
# Read the show names and look up each chart, resuming after the saved position.
Get-Content $list_file | Select-Object -Skip $cur | foreach {
    $url = Get-Url -kw $_.Trim()
    if($url -ne $null){
        Write-Host ('正在解析 {0}' -f $_)
        $png_url = Get-GXT -url $url
        if($png_url -eq $null){
            # No chart found: report it on the console, write nothing to the file.
            'nothing'
        } else {
            $info = 'TTT{0}YYY人物关系图: {1}' -f ($_.Trim(),$png_url)
            Out-File -FilePath $out_file -InputObject $info -Append
        }
        Write-Host '--------------------------------------------------'
        # Pause between shows; 500 = 0.5 s. The author advises a longer delay
        # if entries go missing.
        Start-Sleep -Milliseconds 500
    }
    # Progress counts every consumed line, found or not, to stay aligned
    # with Select-Object -Skip on the next run.
    ++$cur
    Out-File -FilePath $cur_file -InputObject $cur
}

回复 16# lxh623

1 评分人数

lxh623: 乐于助人技术 + 1

https://docs.qq.com/doc/DSVJ0d094Q1NtbGta

TOP

went

少校

Rank: 6 Rank: 6

帖子: 864
积分: 1738
技术: 229
捐助: 0
注册时间: 2017-9-22

7楼

发表于 2021-10-7 12:04 | 显示全部帖子

回复 17# went

66行时间可以改长点,不然数据容易漏掉, 500 = 0.5秒

https://docs.qq.com/doc/DSVJ0d094Q1NtbGta

TOP

went

少校

Rank: 6 Rank: 6

帖子: 864
积分: 1738
技术: 229
捐助: 0
注册时间: 2017-9-22

8楼

发表于 2021-10-10 13:10 | 显示全部帖子

回复 21# lxh623

点头像加我微信私聊

https://docs.qq.com/doc/DSVJ0d094Q1NtbGta

TOP

返回列表

[新手上路]批处理新手入门导读	[视频教程]批处理基础视频教程	[视频教程]VBS基础视频教程	[批处理精品]批处理版照片整理器
[批处理精品]纯批处理备份&还原驱动	[批处理精品]CMD命令50条不能说的秘密	[在线下载]第三方命令行工具	[在线帮助]VBScript / JScript 在线参考

[收藏此主题] [关注此主题的新回复]

[通过 QQ、MSN 分享给朋友]

设置关闭

开启【页面动态效果 + 卡片式布局】
关闭【页面动态效果 + 卡片式布局】

开启【代码高亮】
关闭【代码高亮】

代码高亮主题【亮】
代码高亮主题【暗】


	设置关闭开启【页面动态效果 + 卡片式布局】关闭【页面动态效果 + 卡片式布局】开启【代码高亮】关闭【代码高亮】代码高亮主题【亮】代码高亮主题【暗】

[收藏此主题] [关注此主题的新回复]

[通过 QQ、MSN 分享给朋友]

设置 关闭

开启 【页面动态效果 + 卡片式布局】 关闭 【页面动态效果 + 卡片式布局】 开启 【代码高亮】关闭 【代码高亮】代码高亮主题 【亮】代码高亮主题 【暗】

设置关闭

开启【页面动态效果 + 卡片式布局】
关闭【页面动态效果 + 卡片式布局】

开启【代码高亮】
关闭【代码高亮】

代码高亮主题【亮】
代码高亮主题【暗】