标题: [原创代码] powershell 网页爬虫获取时光网电影数据库 [打印本页]
作者: gflrlm 时间: 2018-2-1 12:01 标题: powershell 网页爬虫获取时光网电影数据库
目的: 用 PowerShell 获取时光网所有电影的数据库, 网址格式是 http://movie.mtime.com/$id/ , 其中 $id 为时光网电影的 id 号, 是每部电影的唯一标识。
最终生成的文件名字是 all_database_movie.csv , 里面每一行的格式为《ID;片名;更多片名;电影类型;国家地区》, 各字段按照分号分隔, 示例如下。用 Excel 打开的时候选中第一列, 然后选择“数据->分列->分隔符号->分号->完成”, 就可以把每个字段显示到单独的单元格, 如图所示:
12556;Halloween(1978)万圣节 ;月光光心慌慌+抓鬼节+;恐怖+惊悚+;美国+;
12557;Halloween III: Season of the Witch(1982)万圣节3 ;月光光心慌慌3+鬼节+;悬疑+科幻+恐怖+;美国+;
12558;Halloween: Resurrection(2002)月光光心慌慌之大屠杀 ;战栗On-line+万圣节8:复活+;惊悚+恐怖+;美国+;
12633;Spider-Man(2002)蜘蛛侠 ;蜘蛛人+;动作+冒险+;美国+;
12634;Spider-Man 2(2004)蜘蛛侠2 ;蜘蛛人2+;动作+冒险+;美国+;
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
使用方法:保存下面的代码为《000___mtime_抓取电影网页-选数据-逗号分列.ps1》, 然后设置以下参数:
$start_id 表示要访问的时光网电影id,1是最小值。
$end_id 表示要访问的 mtime 电影 id 终止值(循环条件是 -lt, 所以不包含该 id 本身), 貌似目前最大的ID = 249736
$web_request_enable 表示是否去访问mtime.com网页 ,因为有时候你可能已经访问过某个特定id的网页, 所以就不需要重复访问。每访问一个网页id,会生成一个id.txt文件,里面包含了网页源代码html格式。
$start_id_txt 表示需要解析的 mtime 电影 id 起始值, 例如 12000
$end_id_txt 表示需要解析的 mtime 电影 id 终止值(循环条件是 -lt, 所以不包含该 id 本身), 例如 34002
// 由参数设置可以看出,访问mtime 和 解析 id.txt文件是可以独立分开的,提高速度。 后面我还会写一篇文章,采用并行处理方法来解析20万个id.txt!
$start_id = 12345 # first mtime movie id to download
$end_id = 12500 # stop id for downloading; the download loop uses -lt, so this id itself is not fetched. The largest known id seems to be 249736
$web_request_enable = 1 # 0: do not access mtime 1: access mtime
$start_id_txt = 12000 # first mtime movie id whose saved page is parsed (12000)
$end_id_txt = 34002 # stop id for parsing (34002); the parse loop also uses -lt
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- # Script preamble.
- # 0: use the id range and file names hard-coded in this script.
- # 1: take $start_id_txt_param, $end_id_txt_param and $call_cnt instead.
- # NOTE(review): those three variables are not defined anywhere in this script;
- # presumably a wrapper sets them before running it - confirm.
- $USING_PARAM_ENABLE=0
-
-
- # NOTE(review): the folder / search-string prompt below looks like a leftover
- # from a file-search script. $Path, $Filter and $MySearchString_tmp are never
- # read again further down; the code is kept so the console output is unchanged.
- $Path ="D:\"
- $Filter = '*.*'
- $_Debug = 1   # 1: use the hard-coded values below, 0: ask via Read-Host
-
- Write-Host ""
- Write-Host "请输入要搜索的文件夹路径:例如 D:\test"
- if ($_Debug) {
-     $Path = "D:\迅雷下载\..torrents"
- } else {
-     $Path = Read-Host
- }
- #if([String]::IsNullOrEmpty($Path))
-
- # Fall back to the current directory when the chosen folder does not exist.
- if(!(Test-Path $Path))
- {
-     $Path =".\"
-     Write-Host "您输入文件夹路径不存在,取当前目录查找:Path = "$Path
- }
-
- Write-Host ""
- Write-Host ""
- Write-Host "Path = " $Path
-
- if ($_Debug) {
-     $MySearchString_tmp = "name\d+:(.*?):name"
- } else {
-     Write-Host "请输入要搜索的字符串:"
-     $MySearchString_tmp = Read-Host
- }
-
-
- # D:\MyProject\   directory to traverse recursively
- # *.cs            files to look for
- # myString        string to look for
- # $found is set to 1 by the parser whenever a field is extracted and is
- # checked once at the very end of the script.
- $found = 0;
- # Files only, no folders:
- #$fileList = Get-ChildItem -Path $Path -Filter $Filter -Recurse | ?{$_.PsIsContainer -eq $false}| %{$_.FullName}
-
-
-
- # Choose the three output files: the movie table, the list of ids whose saved
- # page was incomplete, and the list of ids whose page does not exist.
- # In parameter mode every file name carries "---<call_cnt>".
- $csv_suffix = ""
- if ($USING_PARAM_ENABLE) {
-     Write-Host "start_id_txt_param = $start_id_txt_param"
-     Write-Host "end_id_txt_param = $end_id_txt_param"
-     Write-Host "call_cnt = $call_cnt"
-     $csv_suffix = "---$call_cnt"
- }
- $all_database_movie = ".\all_database_movie${csv_suffix}.csv"
- $all_database_movie_page_error = ".\all_database_movie_page_error${csv_suffix}.csv"
- $all_database_movie_page_404 = ".\all_database_movie_page_404${csv_suffix}.csv"
-
- # Existing result files are kept on purpose (the Remove-Item calls are
- # commented out), so a later run appends to them.
- foreach ($csv in @($all_database_movie, $all_database_movie_page_error, $all_database_movie_page_404)) {
-     if (Test-Path $csv) {
-         #Remove-Item $csv
-     }
- }
-
- # Run parameters.
- # Full paths of the saved pages that will be parsed.
- $fileList = New-Object -TypeName System.Collections.ArrayList
-
- # Download range. The forum colour tags that had leaked into these two
- # assignments were removed; they are not PowerShell.
- $start_id = 12345   # first mtime movie id to download
- $end_id = 12500     # stop id (the download loop uses -lt, so it is not fetched); largest known id seems to be 249736
-
- # Parse range: taken from the *_param variables in parameter mode, otherwise
- # hard-coded here.
- if ($USING_PARAM_ENABLE) {
-     $start_id_txt = $start_id_txt_param   # first id to parse
-     $end_id_txt = $end_id_txt_param       # stop id for parsing
- } else {
-     $start_id_txt = 12000                 # first id to parse
-     $end_id_txt = 34002                   # stop id for parsing
- }
- $filter_txt = "*.txt"            # file name filter for saved pages
- $retry_cnt = 0                   # attempts already made for the current page
- $retry_times = 4                 # maximum number of attempts per page
- $str_connection = "+"            # joins items inside one field, e.g. 美国+日本+中国
- $str_connection_spaces = ";"     # joins the fields of one row, e.g. 谍影重重;美国;动作+剧情
- $web_request_enable = 1          # 0: do not access mtime  1: access mtime
- $database_path="D:\mtime_database"   # folder that holds one <id>.txt per downloaded page
- # Download stage: save http://movie.mtime.com/<id>/ as <id>.txt inside
- # $database_path for every id in [$start_id, $end_id). Ids that already have a
- # file are skipped. A page that comes back with a length below 5 is requested
- # again, up to $retry_times attempts, with a 5 second pause in between.
- if ($web_request_enable) {
-     # Out-File does not create missing parent folders.
-     if (!(Test-Path $database_path)) {
-         New-Item -ItemType Directory -Path $database_path -Force | Out-Null
-     }
-     for ($x = $start_id; $x -lt $end_id; $x++) {
-         $retry_cnt = 0
-         $page_url = "http://movie.mtime.com/$x/"
-         $page_file = Join-Path $database_path "$x.txt"
-         Write-Host "Processing : $page_url"
-         if (Test-Path $page_file) {
-             continue
-         }
-         do {
-             # Reset first: when the request throws, the assignment below is
-             # skipped, and without the reset the previous movie's page would
-             # be saved under this id.
-             $webcontent = $null
-             try {
-                 $webcontent = Invoke-RestMethod -Uri $page_url
-             } catch {
-                 Write-Host "Request failed for mtime_id = $x : $($_.Exception.Message)" -ForegroundColor red
-             }
-             # The file is written even after a failure, so the parser later
-             # sees it as an incomplete page.
-             $webcontent | Out-File $page_file
-
-             $tmpContent11 = Get-Content $page_file
-             if ($tmpContent11.length -lt 5) {
-                 $retry_cnt += 1
-                 Write-Host "Retrying.... mtime_id = $x"
-                 Start-Sleep -Seconds 5
-             }
-         } while ($tmpContent11.length -lt 5 -and $retry_cnt -lt $retry_times) # content missing or incomplete: request the page again
-     }
- } # if $web_request_enable
-
- ###########################
- #### Collect the saved pages to parse
- ###########################
- # Adds the full path of every existing <id>.txt with an id in
- # [$start_id_txt, $end_id_txt) to $fileList.
- # Alternative that takes every *.txt below $database_path:
- # $fileList = Get-ChildItem -Path $database_path -Filter $filter_txt -Recurse |sort | ?{$_.PsIsContainer -eq $false}| %{$_.FullName}
- for ($x = $start_id_txt; $x -lt $end_id_txt; $x++) {
-     $singlefile = Join-Path $database_path "$x.txt"
-     # -PathType Leaf: a folder that happens to be named <id>.txt is ignored.
-     if (Test-Path $singlefile -PathType Leaf) {
-         Write-Host "Add file -> $database_path\$x.txt"
-         # ArrayList.Add returns the new index; discard it so it is not
-         # printed once per file.
-         [void]$fileList.Add($singlefile)
-     }
- }
-
-
- ##############################################
- #### Main Process : patterns and state used by the page parser
- ##############################################
- $error_page_404=0                          # 1 once the current page is recognised as "page does not exist"
-
-
- $movie_db_txt=""                           # row that is appended to the result CSV
- $movie_name="<title>(.*?)</title>"         # captures the page title (the movie name)
- # Splits a title into (text before the first Latin letter)(rest from that letter on).
- # Titles without Latin letters, e.g. Korean or Japanese only, do not match and
- # are not reordered. The class was [a-z|A-Z]; inside brackets "|" is a literal
- # pipe, not an alternation, so it was removed.
- $movie_name_regexp="(.*?)([a-zA-Z].*)"
- $movie_name_regexp_match=0                 # 1 when $movie_name_regexp matched the current title
- $movie_name_rename_not=""                  # title exactly as found on the page
- $movie_name_rename=""                      # title reordered as <Latin part><leading part>
- $movie_name_v_genre=":genre"               # separator used to split the genre line into pieces
- $movie_name_v_genre_regexp=".*genre.*"     # selects the line that holds the genre links
-
- $movie_names_more="更多片名"                 # marker that opens the alternative-titles section
- $movie_names_more_each="<span>(.*?)</span>" # one alternative title, e.g. <span>圣诞快递 3D</span>
- $movie_names_more_flag=0                   # 1 while inside the alternative-titles section
- $movie_names_more_flag_end=0               # 1 after that section has been closed
-
-
- $movie_names_nation="国家地区"               # marker that opens the country/region section
- $movie_names_nation_each="_blank.*?>(.*?)</a>" # one country link, e.g. <a href="http://movie.mtime.com/movie/search/section/?nation=Australia" target="_blank">澳大利亚</a>
- $movie_names_nation_flag=0                 # 1 while inside the country/region section
- $movie_names_nation_flag_end=0             # 1 after that section has been closed
-
- # Prefix for indented console output.
- # NOTE(review): the name suggests six spaces; the forum rendering may have
- # collapsed them into one - confirm against the author's original file.
- $spaces_6=" "
- #write-host $filelist
-
- #****************************************************
- #** Parse every saved page: title, genres, countries, alternative titles
- #****************************************************
- # Reads : $fileList, the pattern variables, $str_connection,
- #         $str_connection_spaces and the three output file names.
- # Writes: one row "id;title;alternative titles;genres;countries;" per page to
- #         $all_database_movie; ids of incomplete pages to
- #         $all_database_movie_page_error; ids of missing pages to
- #         $all_database_movie_page_404. Sets $found = 1 when a field is found.
- #
- # Ids that already have a row in the result CSV (first ';'-separated field).
- # The CSV is read once instead of once per page, and the id is compared
- # exactly, so id 123 is no longer treated as done because a row for 1234
- # (or a title containing "123") exists.
- $known_ids = @{}
- if (Test-Path $all_database_movie) {
-     foreach ($csv_row in @(Get-Content $all_database_movie)) {
-         if ($csv_row -match '^(\d+);') {
-             $known_ids[$matches[1]] = $true
-         }
-     }
- }
-
- Foreach ($file in $fileList)
- {
-     #write-host "file_name=" $file
-
-     # @(...) keeps a one-line file as a one-element array, so the page is
-     # always indexed line by line and never character by character.
-     $tmpContent = @(Get-Content $file)
-
-     # Per-page state. The two title variables are reset as well, so a page
-     # without a <title> cannot inherit the previous movie's title.
-     $error_page_404 = 0
-     $movie_total_name = ""
-     $movie_total_genre = ""
-     $movie_total_nation = ""
-     $movie_name_regexp_match = 0
-     $movie_name_rename_not = ""
-     $movie_name_rename = ""
-     $movie_names_more_flag = 0        # inside the alternative-titles section
-     $movie_names_more_flag_end = 0    # alternative-titles section finished
-     $movie_names_nation_flag = 0      # inside the country/region section
-     $movie_names_nation_flag_end = 0  # country/region section finished
-
-     # The movie id is the file name without ".txt".
-     $file_id = 0
-     if ($file -match '.*\\(\d+)\.txt') { $file_id = $matches[1] }
-     Write-Host "################################################################# " -ForegroundColor black -background red
-     Write-Host "Analyzing : $file ,id = $file_id"
-
-     # Skip pages that already have a row in the result CSV.
-     if ($known_ids.ContainsKey("$file_id")) {
-         Write-Host "Analyzing : $file ,id = $file_id already exist in $all_database_movie, will continue next id file."
-         continue
-     }
-
-     # Fewer than 5 lines: the download was empty or incomplete. Record the id
-     # once and move on to the next page. (This used to "break", which ended
-     # the whole run at the first incomplete page.)
-     if ($tmpContent.Count -lt 5) {
-         Write-Host "################################################################# " -ForegroundColor red
-         Write-Host "############# 内容不完整或者为空,需要重新访问该网页 ######### " -ForegroundColor red
-         Write-Host "############# mtime id = $file ######### " -ForegroundColor red
-         Write-Host "################################################################# " -ForegroundColor red
-         $logged_error_ids = @()
-         if (Test-Path $all_database_movie_page_error) {
-             $logged_error_ids = @(Get-Content $all_database_movie_page_error)
-         }
-         if ($logged_error_ids -notcontains "$file_id") {
-             "$file_id" | Out-File -Append $all_database_movie_page_error
-         }
-         continue
-     }
-
-     for ($i = 0; $i -lt $tmpContent.Count; $i++)
-     {
-         $line = $tmpContent[$i]
-
-         #****************************************************
-         #** (0) Page does not exist
-         #****************************************************
-         # The author's note: matching the Chinese message did not seem to
-         # work, so the marker "TIMEOUT" is used instead.
-         if ($line -match "TIMEOUT") {
-             $error_page_404 = 1
-             Write-Host "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ " -ForegroundColor yellow
-             Write-Host "@@@@@@@@@@@@@@ 对不起, 你要访问的页面不存在 @@@@@@@@@@ " -ForegroundColor yellow
-             Write-Host "@@@@@@@@@@@@@@ mtime id = $file @@@@@@@@@@ " -ForegroundColor red
-             Write-Host "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ " -ForegroundColor yellow
-             $logged_404_ids = @()
-             if (Test-Path $all_database_movie_page_404) {
-                 $logged_404_ids = @(Get-Content $all_database_movie_page_404)
-             }
-             if ($logged_404_ids -notcontains "$file_id") {
-                 "$file_id" | Out-File -Append $all_database_movie_page_404
-             }
-             # Nothing else is parsed for a missing page.
-             break
-         }
-
-         #****************************************************
-         #** (1) Genres: drama, sci-fi, ...
-         #****************************************************
-         if ($line -match $movie_name_v_genre_regexp)
-         {
-             Write-Host $spaces_6 $file : " $i " :
-             Write-Host $spaces_6 "---------------------------------------"
-             # Cut the line at every ":genre" and take the link text of each piece.
-             foreach ($genre_piece in ($matches[0] -split $movie_name_v_genre)) {
-                 if ($genre_piece -match ".*?>(.*?)</a") {
-                     $movie_total_genre += $matches[1] + $str_connection
-                 }
-             }
-             Write-Host $spaces_6 "---------------------------------------"
-             $found = 1;
-         }
-
-         #****************************************************
-         #** (2) Movie title
-         #****************************************************
-         if ($line -match $movie_name)
-         {
-             # Strip characters / entity fragments that occur in some titles.
-             # NOTE(review): these literals may have been HTML entities that the
-             # forum rendering decoded - confirm against the author's original file.
-             $name_tmp=$line -replace '"',""
-             $name_tmp_0=$name_tmp -replace '·'," "
-             $name_tmp_1=$name_tmp_0 -replace '#183;'," "
-             $name_tmp_2=$name_tmp_1 -replace '½'," "
-             $name_tmp_3=$name_tmp_2 -replace 'ü'," "
-             $name_tmp_4=$name_tmp_3 -replace '&#228;',"ä"
-             if ($name_tmp_4 -match $movie_name) {
-                 Write-Host $spaces_6 $file : " $i " :
-                 Write-Host $spaces_6 "---------------------------------------"
-                 # A stray "、" after $matches[0] was removed here.
-                 Write-Host $spaces_6 $matches[0]
-                 $movie_name_rename_not = $matches[1]
-                 Write-Host $spaces_6 "movie_name ="$movie_name_rename_not
-                 $tmp=$matches[1]
-                 # Move the Latin-letter part of the title to the front.
-                 if ($tmp -match $movie_name_regexp) {
-                     $movie_name_rename=$matches[2]+$matches[1]
-                     Write-Host $spaces_6 "movie_name_rename ="$movie_name_rename
-                     $movie_name_regexp_match = 1
-                 }
-                 Write-Host $spaces_6 "---------------------------------------"
-                 $found = 1;
-             }
-         }
-
-         #****************************************************
-         #** (3) Country / region
-         #****************************************************
-         # The section starts at the marker line and ends at the next </dd>.
-         if ($movie_names_nation_flag -eq 1) {
-             if ($line -match "</dd>")
-             {
-                 $movie_names_nation_flag_end=1
-                 $movie_names_nation_flag = 0
-             }
-         }
-         if ($movie_names_nation_flag -eq 0) {
-             if ($line -match $movie_names_nation)
-             {
-                 $movie_names_nation_flag=1
-                 Write-Host $spaces_6 $file : " $i " :
-                 Write-Host $spaces_6 "---------------------------------------"
-                 Write-Host $spaces_6 $matches[0]
-                 Write-Host $spaces_6 "movie_names_nation ="$matches[1]
-                 Write-Host $spaces_6 "---------------------------------------"
-                 $found = 1;
-             }
-         } elseif ($movie_names_nation_flag -eq 1 -and $movie_names_nation_flag_end -eq 0) {
-             # Inside the section: every link text is one country.
-             if ($line -match $movie_names_nation_each)
-             {
-                 $movie_names_nation_flag=1
-                 Write-Host $spaces_6 $file : " $i " :
-                 Write-Host $spaces_6 "----------------------------------------" -ForegroundColor yellow
-                 Write-Host $spaces_6 $matches[0]
-                 Write-Host $spaces_6 "movie_names_nation_each ="$matches[1] -ForegroundColor yellow
-                 $movie_total_nation += $matches[1] + $str_connection
-                 Write-Host $spaces_6 "---------------------------------------" -ForegroundColor yellow
-                 $found = 1;
-             }
-         }
-
-         #****************************************************
-         #** (4) Alternative titles
-         #****************************************************
-         # Same scheme: marker line opens the section, </dd> closes it.
-         if ($movie_names_more_flag -eq 1) {
-             if ($line -match "</dd>")
-             {
-                 $movie_names_more_flag_end=1
-                 $movie_names_more_flag=0
-             }
-         }
-         # Once the section is closed nothing below applies to later lines.
-         if ($movie_names_more_flag_end -eq 1) {continue}
-         if ($movie_names_more_flag -eq 0) {
-             if ($line -match $movie_names_more)
-             {
-                 $movie_names_more_flag=1
-                 Write-Host $spaces_6 $file : " $i " :
-                 Write-Host $spaces_6 "---------------------------------------"
-                 Write-Host $spaces_6 $matches[0]
-                 Write-Host $spaces_6 "movie_names_more ="$matches[1]
-                 Write-Host $spaces_6 "---------------------------------------"
-                 $found = 1;
-             }
-         } else {
-             if ($line -match $movie_names_more_each)
-             {
-                 # Same clean-up as for the title, plus '&' -> ' '.
-                 # NOTE(review): because '&' is replaced first, the last
-                 # replacement ('&#228;') can no longer match; left unchanged.
-                 $name_each_tmp=$line -replace '·'," "
-                 $name_each_tmp_0=$name_each_tmp -replace '&'," "
-                 $name_each_tmp_1=$name_each_tmp_0 -replace '#183;'," "
-                 $name_each_tmp_2=$name_each_tmp_1 -replace '½'," "
-                 $name_each_tmp_3=$name_each_tmp_2 -replace 'ü'," "
-                 $name_each_tmp_4=$name_each_tmp_3 -replace '&#228;',"ä"
-
-                 $movie_names_more_flag=1
-                 Write-Host $spaces_6 $file : " $i " :
-                 if ($name_each_tmp_4 -match $movie_names_more_each) {
-                     Write-Host $spaces_6 "---------------------------------------" -ForegroundColor yellow
-                     Write-Host $spaces_6 $matches[0]
-                     Write-Host $spaces_6 "movie_names_more_each ="$matches[1] -ForegroundColor yellow
-                     $movie_total_name += $matches[1] + $str_connection
-                     Write-Host $spaces_6 "---------------------------------------" -ForegroundColor yellow
-                     $found = 1;
-                 }
-             }
-         }
-     } #for
-
-     # Append the row for this page unless the page turned out to be missing.
-     if ($error_page_404 -eq 0) {
-         $movie_db_txt = ""
-         Write-Host "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" -ForegroundColor green
-         Write-Host "&&&&& movie_id ="$file -ForegroundColor green
-         $movie_db_txt = $movie_db_txt + $file_id + $str_connection_spaces
-         if ($movie_name_regexp_match -eq 0) {
-             Write-Host "&&&&& movie_name_rename_not ="$movie_name_rename_not -ForegroundColor green
-             $movie_db_txt += $movie_name_rename_not + $str_connection_spaces
-         } else {
-             Write-Host "&&&&& movie_name_rename ="$movie_name_rename -ForegroundColor green
-             $movie_db_txt += $movie_name_rename + $str_connection_spaces
-         }
-         Write-Host "&&&&& movie_total_name ="$movie_total_name -ForegroundColor green
-         Write-Host "&&&&& movie_total_genre ="$movie_total_genre -ForegroundColor green
-         Write-Host "&&&&& movie_total_nation ="$movie_total_nation -ForegroundColor green
-         $movie_db_txt = $movie_db_txt + $movie_total_name + $str_connection_spaces
-         $movie_db_txt = $movie_db_txt + $movie_total_genre + $str_connection_spaces
-         $movie_db_txt = $movie_db_txt + $movie_total_nation + $str_connection_spaces
-         $movie_db_txt | Out-File -Append $all_database_movie
-         # Keep the in-memory id set in step with the CSV.
-         $known_ids["$file_id"] = $true
-         Write-Host "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" -ForegroundColor green
-     }
- }#foreach each_movie.txt
-
-
- # End of run: print the closing banner, warn when the parser extracted
- # nothing at all, then keep the console window open until a key is pressed.
- Write-Host "------------------ script end! --------------"
- $nothing_extracted = ($found -eq 0)
- if ($nothing_extracted) {
-     Write-Host "没有找到您查找的字符串,请重新输入吧,O(∩_∩)O~ " -ForegroundColor red -background black
- }
-
- # for ($x=0; $x -lt $ar.Count; $x++) { write-host "ar["$x"]" = $ar[$x] }
-
-
-
-
- cmd /c "pause"
- # Start-Sleep -s 10
复制代码
作者: 523066680 时间: 2018-2-1 15:27
本帖最后由 523066680 于 2018-2-1 15:33 编辑
bt天堂的资料挺齐全,抓过封面和磁力。主目录按评分,次级目录按类型
现在看看豆瓣影评的分类和评分索引也都做的很好,抓起来好分类。
作者: gflrlm 时间: 2018-2-8 22:37
回复 2# 523066680
版主大佬,能分享一下吗? 谢谢了,再写一遍浪费时间了,哈哈,偷个懒。。。
作者: 523066680 时间: 2018-2-8 22:57
回复 3# gflrlm
已经回老家,节后分享
欢迎光临 批处理之家 (http://bbs.bathome.net/) |
Powered by Discuz! 7.2 |