powershell 网页爬虫获取时光网电影数据库
目的: 用 PowerShell 抓取时光网的全部电影数据。网址格式是 [url]http://movie.mtime.com/[/url]$id/ ,其中 $id 为时光网电影的 id 号,是每部电影的唯一标识。最终生成的文件名是 all_database_movie.csv,每行的格式为《ID;片名;更多片名;电影类型;国家地区》,示例如下。[b][color=Red]各字段按分号分隔;用 Excel 打开时先选中第一列,然后选择“数据->分列->分隔符号->分号->完成”,即可把各字段拆分到各自的单元格中[/color][/b],如图所示:
[color=RoyalBlue]12556;Halloween(1978)万圣节 ;月光光心慌慌+抓鬼节+;恐怖+惊悚+;美国+;
12557;Halloween III: Season of the Witch(1982)万圣节3 ;月光光心慌慌3+鬼节+;悬疑+科幻+恐怖+;美国+;
12558;Halloween: Resurrection(2002)月光光心慌慌之大屠杀 ;战栗On-line+万圣节8:复活+;惊悚+恐怖+;美国+;
12633;Spider-Man(2002)蜘蛛侠 ;蜘蛛人+;动作+冒险+;美国+;
12634;Spider-Man 2(2004)蜘蛛侠2 ;蜘蛛人2+;动作+冒险+;美国+;[/color]
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
使用方法:保存下面的代码为《[b][color=Red]000___mtime_抓取电影网页-选数据-逗号分列.ps1[/color][/b]》, 然后设置以下参数:
$start_id 表示要访问的时光网电影id,1是最小值。
$end_id 表示要访问 mtime 电影 id end , 貌似目前最大的ID = 249736
$web_request_enable 表示是否去访问mtime.com网页 ,因为有时候你可能已经访问过某个特定id的网页, 所以就不需要重复访问。每访问一个网页id,[color=Red][b]会生成一个id.txt文件[/b][/color],里面包含了网页源代码html格式。
$start_id_txt 表示需要解析的 mtime 电影 id start 12000
$end_id_txt 表示需要解析的 mtime 电影 id end 34002
// 由参数设置可以看出,访问mtime 和 解析 id.txt文件是可以独立分开的,提高速度。 [b][color=Red]后面我还会写一篇文章,采用并行处理方法来解析20万个id.txt!
[/color][/b][color=Red]
$start_id = 12345 # mtime 电影 id start
$end_id = 12500 # mtime 电影 id end , 貌似目前最大的ID = 249736
$web_request_enable = 1 # 0: 不访问mtime 1: 访问mtime
$start_id_txt = 12000 # 需要解析的 mtime 电影 id start 12000
$end_id_txt = 34002 # 需要解析的 mtime 电影 id end 34002[/color]
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[code]
# ---------------------------------------------------------------
# Interactive search settings.
# $Path, $Filter and $MySearchString_tmp are assigned here but are not
# referenced by the rest of the visible script; with $_Debug = 1 the
# hard-coded values are used and nothing is read from the console.
# ---------------------------------------------------------------
$USING_PARAM_ENABLE = 0   # non-zero: ID range and CSV suffix come from the *_param / $call_cnt variables
$_Debug = 1               # non-zero: skip the Read-Host prompts below
$Filter = '*.*'
$Path = "D:\"

Write-Host ""
Write-Host "请输入要搜索的文件夹路径:例如 D:\test"
$Path = if ($_Debug) { "D:\迅雷下载\..torrents" } else { Read-Host }

# Fall back to the current directory when the chosen folder does not exist.
#if([String]::IsNullOrEmpty($Path))
if (-not (Test-Path $Path)) {
    $Path = ".\"
    Write-Host "您输入文件夹路径不存在,取当前目录查找:Path = "$Path
}

1..2 | ForEach-Object { Write-Host "" }
Write-Host "Path = " $Path

# Search pattern: fixed in debug mode, otherwise typed by the user.
if (-not $_Debug) {
    Write-Host "请输入要搜索的字符串:"
    $MySearchString_tmp = Read-Host
}
else {
    $MySearchString_tmp = "name\d+:(.*?):name"
}
# Notes kept from the original search script:
#   D:\MyProject\  directory to walk recursively
#   *.cs           files to look at
#   myString       text to search for
$found = 0;   # the parsing stage sets this to 1 when it extracts a field

# Files only, no folders:
#$fileList = Get-ChildItem -Path $Path -Filter $Filter -Recurse | ?{$_.PsIsContainer -eq $false}| %{$_.FullName}

# Output files: the movie rows, the IDs whose saved page was incomplete,
# and the IDs whose page does not exist. In parameter mode every file name
# gets a "---$call_cnt" suffix.
$csv_suffix = ""
if ($USING_PARAM_ENABLE) {
    Write-Host "start_id_txt_param = $start_id_txt_param"
    Write-Host "end_id_txt_param = $end_id_txt_param"
    Write-Host "call_cnt = $call_cnt"
    $csv_suffix = "---$call_cnt"
}
$all_database_movie = ".\all_database_movie${csv_suffix}.csv"
$all_database_movie_page_error = ".\all_database_movie_page_error${csv_suffix}.csv"
$all_database_movie_page_404 = ".\all_database_movie_page_404${csv_suffix}.csv"

# Existing output is deliberately kept (the deletion is commented out),
# so a later run appends to what an earlier run produced.
foreach ($csv_file in $all_database_movie, $all_database_movie_page_error, $all_database_movie_page_404) {
    if (Test-Path $csv_file) {
        #Remove-Item $csv_file
    }
}
# ---------------------------------------------------------------
# Crawl / parse parameters.
# Fix: the forum markup "[color=Red]" that preceded $start_id has been
# removed; it is not valid PowerShell and stopped the script from parsing.
# ---------------------------------------------------------------
$fileList = New-Object -TypeName System.Collections.ArrayList   # full paths of the saved pages to parse

# ID range to download. The download loop compares with -lt, so $end_id
# itself is not fetched. Original note: the largest known ID seemed to be 249736.
$start_id = 12345
$end_id = 12500

# ID range of saved <id>.txt pages to parse (also compared with -lt).
if ($USING_PARAM_ENABLE) {
    $start_id_txt = $start_id_txt_param
    $end_id_txt = $end_id_txt_param
} else {
    $start_id_txt = 12000
    $end_id_txt = 34002
}

$filter_txt = "*.txt"            # file-name filter for the saved pages
$retry_cnt = 0                   # retries used so far for the current page
$retry_times = 4                 # maximum retries when a page comes back incomplete
$str_connection = "+"            # joins items inside one field, e.g. 美国+日本+中国
$str_connection_spaces = ";"     # joins the fields of a row, e.g. 谍影重重;美国;动作+剧情
$web_request_enable = 1          # 0: do not contact mtime, 1: download pages
$database_path = "D:\mtime_database"   # folder that holds the saved <id>.txt pages
# ---------------------------------------------------------------
# Download stage: save http://movie.mtime.com/<id>/ as
# $database_path\<id>.txt for every id in [$start_id, $end_id).
# IDs whose file already exists are skipped. A page whose saved content
# has Length < 5 is requested again, up to $retry_times times, with a
# 5 second pause between attempts.
#
# Fixes:
#  * the "[url]...[/url]" forum markup has been removed from the request
#    URI and from the progress message;
#  * $webcontent is cleared before each request and request errors are
#    caught, so a failed request can no longer save the previous page's
#    HTML under the current id (it now saves an empty file, which
#    triggers the retry path).
# ---------------------------------------------------------------
if ($web_request_enable) {
    for ($x = $start_id; $x -lt $end_id; $x++)
    {
        $retry_cnt = 0
        $page_url = "http://movie.mtime.com/$x/"
        $page_file = "$database_path\$x.txt"
        Write-Host "Processing : $page_url"
        if (Test-Path $page_file) {
            continue
        }
        do {
            $webcontent = $null
            try {
                $webcontent = Invoke-RestMethod -Uri $page_url
            } catch {
                # URL missing or no network: report it and let the length check below retry.
                Write-Host "Request failed, mtime_id = $x : $($_.Exception.Message)" -ForegroundColor Red
            }
            $webcontent | Out-File $page_file

            # Re-read what was saved; content that is too short counts as incomplete.
            $tmpContent11 = Get-Content $page_file
            if ($tmpContent11.length -lt 5) {
                $retry_cnt += 1
                Write-Host "Retrying.... mtime_id = $x"
                cmd /c "choice /t 5 /d y /n 1>nul 2>nul"   # 5 second wait
            }
        } while ($tmpContent11.length -lt 5 -and $retry_cnt -lt $retry_times)
    }
}# if $web_request_enable
###########################
#### Collect the saved pages to parse
###########################
# For every id in [$start_id_txt, $end_id_txt) whose <id>.txt exists in
# $database_path, append the file's full path to $fileList.
#
# Fixes:
#  * ArrayList.Add returns the new index; it is now discarded instead of
#    being written to the output stream once per file;
#  * nothing is added when the path exists but is not a file (the
#    Get-ChildItem pipeline then yields nothing).
# Alternative that takes every *.txt in the folder:
# $fileList = Get-ChildItem -Path $database_path -Filter $filter_txt -Recurse |sort | ?{$_.PsIsContainer -eq $false}| %{$_.FullName}
for ($x = $start_id_txt; $x -lt $end_id_txt; $x++)
{
    if (Test-Path $database_path\$x.txt) {
        $singlefile = Get-ChildItem -Path $database_path -Filter "$x.txt" | ?{$_.PsIsContainer -eq $false} | %{$_.FullName}
        if ($singlefile) {
            Write-Host "Add file -> $database_path\$x.txt"
            [void]$fileList.Add($singlefile)
        }
    }
}
##############################################
#### Main process: patterns and state used while parsing the saved pages
##############################################

# --- output / status ---
$movie_db_txt = ""                  # text of the CSV row being assembled
$error_page_404 = 0                 # 1 when the page is the "page does not exist" page
$spaces_6 = " "                     # prefix for console output; NOTE(review): the name suggests six spaces but the literal holds one - confirm

# --- title ---
$movie_name = "<title>(.*?)</title>"          # captures the page title
$movie_name_regexp = "(.*?)([a-z|A-Z].*)"     # splits a title at its first Latin letter; titles without one (e.g. Korean, Japanese) are not renamed
$movie_name_regexp_match = 0                  # 1 when $movie_name_regexp matched the title
$movie_name_rename_not = ""                   # title exactly as captured
$movie_name_rename = ""                       # title with the Latin part moved to the front

# --- genre (drama, comedy, horror, thriller, ...) ---
$movie_name_v_genre = ":genre"                # separator used to split a genre line into items
$movie_name_v_genre_regexp = ".*genre.*"      # selects the lines that carry genre links

# --- alternate titles ---
$movie_names_more = "更多片名"                        # marker that opens the alternate-title section
$movie_names_more_each = "<span>(.*?)</span>"        # one alternate title, e.g. <span>圣诞快递 3D</span>
$movie_names_more_flag = 0                           # 1 while inside the alternate-title section
$movie_names_more_flag_end = 0                       # 1 once that section has been closed

# --- country / region ---
$movie_names_nation = "国家地区"                      # marker that opens the country section
$movie_names_nation_each = "_blank.*?>(.*?)</a>"     # one country link, e.g. <a href="..." target="_blank">澳大利亚</a>
$movie_names_nation_flag = 0                         # 1 while inside the country section
$movie_names_nation_flag_end = 0                     # 1 once that section has been closed

#write-host $filelist
#****************************************************
#** Extract title, alternate titles, genre and country from every saved page
#****************************************************
# For each file in $fileList the id is taken from the file name, the page is
# scanned line by line, and one row
#     id;title;alternate titles;genres;countries;
# is appended to $all_database_movie. IDs of pages that are too short go to
# $all_database_movie_page_error, IDs of "page does not exist" pages go to
# $all_database_movie_page_404.
#
# Fixes compared with the previous version:
#  * an incomplete page now skips to the next file; the old "break" was
#    outside any inner loop and ended the whole foreach at the first such page;
#  * the already-processed check compares whole IDs; the old unanchored
#    "$file_id.*?;" match treated id 123 as present when a row for 12345 existed;
#  * the three CSV files are read once, and only if they exist, instead of
#    being re-read (and failing on a first run) for every page;
#  * loops stop at Length - 1 instead of reading one element past the end;
#  * the title variables are reset per file, so a page without <title> no
#    longer inherits the previous movie's name;
#  * a stray "、" after $matches[0] in a Write-Host call was removed.

# IDs already present in the output files (keys are ID strings).
$done_ids = @{}
if (Test-Path $all_database_movie) {
    foreach ($row in @(Get-Content $all_database_movie)) {
        if ($row -match '^(\d+);') { $done_ids[$matches[1]] = $true }
    }
}
$error_ids = @{}
if (Test-Path $all_database_movie_page_error) {
    foreach ($row in @(Get-Content $all_database_movie_page_error)) {
        if ($row) { $error_ids[$row.Trim()] = $true }
    }
}
$missing_ids = @{}
if (Test-Path $all_database_movie_page_404) {
    foreach ($row in @(Get-Content $all_database_movie_page_404)) {
        if ($row) { $missing_ids[$row.Trim()] = $true }
    }
}

Foreach ($file in $fileList)
{
    #write-host "file_name=" $file
    $tmpContent = Get-Content $file

    # Per-file state.
    $error_page_404 = 0
    $movie_total_name = ""
    $movie_total_genre = ""
    $movie_total_nation = ""
    $movie_name_rename_not = ""
    $movie_name_rename = ""
    $movie_name_regexp_match = 0
    $movie_names_more_flag = 0          # inside the alternate-title section
    $movie_names_more_flag_end = 0      # alternate-title section closed
    $movie_names_nation_flag = 0        # inside the country section
    $movie_names_nation_flag_end = 0    # country section closed
    $file_id = 0
    if ($file -match ".*\\(\d+).txt") { $file_id = $matches[1] }   # id = digits of the file name

    write-host "################################################################# " -ForegroundColor black -background red
    Write-Host "Analyzing : $file ,id = $file_id"

    # Skip IDs that already have a row in the output CSV.
    if ($done_ids.ContainsKey("$file_id")) {
        Write-Host "Analyzing : $file ,id = $file_id already exist in $all_database_movie, will continue next id file."
        continue
    }

    # Page too short: record the id once and move on to the next file.
    if ($tmpContent.length -lt 5) {
        write-host "################################################################# " -ForegroundColor red
        write-host "############# 内容不完整或者为空,需要重新访问该网页 ######### " -ForegroundColor red
        write-host "############# mtime id = $file ######### " -ForegroundColor red
        write-host "################################################################# " -ForegroundColor red
        if (-not $error_ids.ContainsKey("$file_id")) {
            "$file_id" | Out-File -Append $all_database_movie_page_error
            $error_ids["$file_id"] = $true
        }
        continue
    }

    for ($i = 0; $i -lt $tmpContent.length; $i++)
    {
        $line = $tmpContent[$i]

        #****************************************************
        #** (0) "Sorry, the page you requested does not exist"
        #****************************************************
        # The page is recognised by the word TIMEOUT; per the original
        # note, matching the Chinese text did not seem to work.
        if ($line -match "TIMEOUT") {
            $error_page_404 = 1
            write-host "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ " -ForegroundColor yellow
            write-host "@@@@@@@@@@@@@@ 对不起, 你要访问的页面不存在 @@@@@@@@@@ " -ForegroundColor yellow
            write-host "@@@@@@@@@@@@@@ mtime id = $file @@@@@@@@@@ " -ForegroundColor red
            write-host "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ " -ForegroundColor yellow
            if (-not $missing_ids.ContainsKey("$file_id")) {
                "$file_id" | Out-File -Append $all_database_movie_page_404
                $missing_ids["$file_id"] = $true
            }
            # Nothing is extracted from such a page, so stop scanning it.
            break
        }

        #****************************************************
        #** (0) Genre: drama, sci-fi, ...
        #****************************************************
        if ($line -match $movie_name_v_genre_regexp)
        {
            write-host $spaces_6 $file : " $i " :
            #write-host $file " : " $i " : " $tmpContent[$i] -ForegroundColor yellow -background black
            write-host $spaces_6 "---------------------------------------"
            # Split the line at every ":genre" and take the link text of each piece.
            $tmp = $matches[0]
            $tmp1 = $tmp -replace $movie_name_v_genre,"`n"
            $tmp2 = $tmp1.split("`n")
            for ($j = 0; $j -lt $tmp2.length; $j++) {
                if ($tmp2[$j] -match ".*?>(.*?)</a") {
                    $movie_total_genre += $matches[1] + $str_connection
                }
            }
            write-host $spaces_6 "---------------------------------------"
            $found = 1;
        }

        #****************************************************
        #** (1) Title
        #****************************************************
        if ($line -match $movie_name)
        {
            # Some titles contain special characters; strip or replace them first.
            $name_tmp = $line -replace '"',""
            $name_tmp_0 = $name_tmp -replace '·'," "
            $name_tmp_1 = $name_tmp_0 -replace '#183;'," "
            $name_tmp_2 = $name_tmp_1 -replace '½'," "
            $name_tmp_3 = $name_tmp_2 -replace 'ü'," "
            $name_tmp_4 = $name_tmp_3 -replace '&#228;',"ä"
            if ($name_tmp_4 -match $movie_name) {
                write-host $spaces_6 $file : " $i " :
                #write-host $file " : " $i " : " $tmpContent[$i] -ForegroundColor yellow -background black
                write-host $spaces_6 "---------------------------------------"
                write-host $spaces_6 $matches[0]
                $movie_name_rename_not = $matches[1]
                write-host $spaces_6 "movie_name ="$movie_name_rename_not
                $tmp = $matches[1]
                # Move the part starting at the first Latin letter in front of the rest.
                if ($tmp -match $movie_name_regexp) {
                    $movie_name_rename = $matches[2] + $matches[1]
                    write-host $spaces_6 "movie_name_rename ="$movie_name_rename
                    $movie_name_regexp_match = 1
                }
                write-host $spaces_6 "---------------------------------------"
                $found = 1;
            }
        }

        #****************************************************
        #** (2) Country / region
        #****************************************************
        # A "</dd>" while inside the section closes it.
        if ($movie_names_nation_flag -eq 1) {
            if ($line -match "</dd>")
            {
                $movie_names_nation_flag_end = 1
                $movie_names_nation_flag = 0
            }
        }
        #if($movie_names_nation_flag_end -eq 1) {continue}
        if ($movie_names_nation_flag -eq 0) {
            # Not inside the section: look for its heading.
            if ($line -match $movie_names_nation)
            {
                $movie_names_nation_flag = 1
                write-host $spaces_6 $file : " $i " :
                #write-host $file " : " $i " : " $tmpContent[$i] -ForegroundColor yellow -background black
                write-host $spaces_6 "---------------------------------------"
                write-host $spaces_6 $matches[0]
                write-host $spaces_6 "movie_names_nation ="$matches[1]
                write-host $spaces_6 "---------------------------------------"
                $found = 1;
            }
        } elseif ($movie_names_nation_flag -eq 1 -and $movie_names_nation_flag_end -eq 0) {
            # Inside the section: collect each country link.
            if ($line -match $movie_names_nation_each)
            {
                $movie_names_nation_flag = 1
                write-host $spaces_6 $file : " $i " :
                #write-host $file " : " $i " : " $tmpContent[$i] -ForegroundColor yellow -background black
                write-host $spaces_6 "----------------------------------------" -ForegroundColor yellow #-background black
                write-host $spaces_6 $matches[0]
                write-host $spaces_6 "movie_names_nation_each ="$matches[1] -ForegroundColor yellow #-background black
                $movie_total_nation += $matches[1] + $str_connection
                write-host $spaces_6 "---------------------------------------" -ForegroundColor yellow #-background black
                $found = 1;
            }
        }

        #****************************************************
        #** (3) Alternate titles / translated titles
        #****************************************************
        # A "</dd>" while inside the section closes it.
        if ($movie_names_more_flag -eq 1) {
            if ($line -match "</dd>")
            {
                $movie_names_more_flag_end = 1
                $movie_names_more_flag = 0
            }
        }
        # Once the section is closed, the rest of this step is skipped for every later line.
        if ($movie_names_more_flag_end -eq 1) { continue }
        if ($movie_names_more_flag -eq 0) {
            # Not inside the section: look for its heading.
            if ($line -match $movie_names_more)
            {
                $movie_names_more_flag = 1
                write-host $spaces_6 $file : " $i " :
                #write-host $file " : " $i " : " $tmpContent[$i] -ForegroundColor yellow -background black
                write-host $spaces_6 "---------------------------------------"
                write-host $spaces_6 $matches[0]
                write-host $spaces_6 "movie_names_more ="$matches[1]
                write-host $spaces_6 "---------------------------------------"
                $found = 1;
            }
        } else {
            # Inside the section: collect each <span>...</span> title.
            if ($line -match $movie_names_more_each)
            {
                $name_each_tmp = $line -replace '·'," "
                $name_each_tmp_0 = $name_each_tmp -replace '&'," "
                $name_each_tmp_1 = $name_each_tmp_0 -replace '#183;'," "
                $name_each_tmp_2 = $name_each_tmp_1 -replace '½'," "
                $name_each_tmp_3 = $name_each_tmp_2 -replace 'ü'," "
                $name_each_tmp_4 = $name_each_tmp_3 -replace '&#228;',"ä"
                $movie_names_more_flag = 1
                write-host $spaces_6 $file : " $i " :
                if ($name_each_tmp_4 -match $movie_names_more_each) {
                    #write-host $file " : " $i " : " $tmpContent[$i] -ForegroundColor yellow -background black
                    write-host $spaces_6 "---------------------------------------" -ForegroundColor yellow #-background black
                    write-host $spaces_6 $matches[0]
                    write-host $spaces_6 "movie_names_more_each ="$matches[1] -ForegroundColor yellow #-background black
                    $movie_total_name += $matches[1] + $str_connection
                    write-host $spaces_6 "---------------------------------------" -ForegroundColor yellow #-background black
                    $found = 1;
                }
            }
        }
    } #for

    # Write the row unless the page was the "does not exist" page.
    if ($error_page_404 -eq 0) {
        $movie_db_txt = ""
        write-host "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" -ForegroundColor green
        write-host "&&&&& movie_id ="$file -ForegroundColor green
        $movie_db_txt = $movie_db_txt + $file_id + $str_connection_spaces
        if ($movie_name_regexp_match -eq 0) {
            write-host "&&&&& movie_name_rename_not ="$movie_name_rename_not -ForegroundColor green
            $movie_db_txt += $movie_name_rename_not + $str_connection_spaces
        } else {
            write-host "&&&&& movie_name_rename ="$movie_name_rename -ForegroundColor green
            $movie_db_txt += $movie_name_rename + $str_connection_spaces
        }
        write-host "&&&&& movie_total_name ="$movie_total_name -ForegroundColor green
        write-host "&&&&& movie_total_genre ="$movie_total_genre -ForegroundColor green
        write-host "&&&&& movie_total_nation ="$movie_total_nation -ForegroundColor green
        $movie_db_txt = $movie_db_txt + $movie_total_name + $str_connection_spaces
        $movie_db_txt = $movie_db_txt + $movie_total_genre + $str_connection_spaces
        $movie_db_txt = $movie_db_txt + $movie_total_nation + $str_connection_spaces
        $movie_db_txt | Out-File -append $all_database_movie
        $done_ids["$file_id"] = $true
        write-host "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" -ForegroundColor green
    }
}#foreach each_movie.txt
# End of run: print the closing banner, report when no field was extracted
# from any page ($found is still 0), then keep the console window open.
Write-Host "------------------ script end! --------------"
if ($found -eq 0) { Write-Host "没有找到您查找的字符串,请重新输入吧,O(∩_∩)O~ " -ForegroundColor Red -BackgroundColor Black }
# for ($x=0; $x -lt $ar.Count; $x++) { write-host "ar["$x"]" = $ar[$x] }
cmd /c "pause"
# Start-Sleep -s 10[/code] [i=s] 本帖最后由 523066680 于 2018-2-1 15:33 编辑 [/i]
bt天堂的资料挺齐全,抓过封面和磁力。主目录按评分,次级目录按类型
[img]http://imgout.ph.126.net/59930019/bt.jpg[/img]
现在看看豆瓣影评的分类和评分索引也都做的很好,抓起来好分类。 [b]回复 [url=http://www.bathome.net/redirect.php?goto=findpost&pid=206459&ptid=47265]2#[/url] [i]523066680[/i] [/b]
版主大佬,能分享一下吗? 谢谢了,再写一遍浪费时间了,哈哈,偷个懒。。。 [b]回复 [url=http://bbs.bathome.net/redirect.php?goto=findpost&pid=206600&ptid=47265]3#[/url] [i]gflrlm[/i] [/b]
已经回老家,节后分享
页:
[1]