本帖最后由 pcl_test 于 2019-2-8 11:01 编辑
感谢帖
十分感谢本论坛。由于工作需要，春节假期用批处理和论坛提供的 sed、iconv 工具编写了一款爬取最高法裁判文书的爬虫脚本，总共 1 万多份裁判文书都可以在本地进行搜索查询了。虽然只是静态网页爬虫，但十分有成就感，今后争取能学会动态网页爬虫技术，哈哈。下面上代码：
- set url_1=http://www.court.gov.cn/wenshu.html
- @echo off
- rem Crawl the judgment documents listed on the url_1 index page and save each
- rem one as a text file in the current directory.
- rem Needs curl, iconv and GNU sed on PATH, and url_1 set before this point.
- rem NOTE(review): the sed patterns hold Chinese literals that are matched
- rem against pages converted to GBK, so this script presumably has to be saved
- rem in the GBK code page as well. Confirm.
- rem Every variable below is read as a delayed-expansion reference, so it must be on.
- setlocal enabledelayedexpansion
- echo # target = %url_1%
- rem Fetch the index page, convert it from UTF-8 to GBK and delete the
- rem whitespace inside every line so tags can be matched as fixed strings.
- curl %url_1%>temp.tmp 2>nul
- iconv -c -f utf-8 -t gbk//ignore temp.tmp >gbk.tmp
- sed -r -i "s/[[:space:]]//g" gbk.tmp
- rem The link in the li element with class last carries the last page number.
- sed -n -r "/<liclass=\"last\">.*page/p" gbk.tmp >temp.tmp
- sed -r -i "s/.*page.([0-9]+)(\.html)?\".*/\1/" temp.tmp
- set /p pages=<temp.tmp
- rem The total number of articles is printed between font tags.
- sed -n -r "/共收录<font>([0-9]+)<\/font>份/p" gbk.tmp >temp.tmp
- sed -r -i "s/.*共收录<font>([0-9]+)<\/font>份.*/\1/" temp.tmp
- set /p articles=<temp.tmp
- echo # find !pages! pages , !articles! articles
- rem Ask again on empty input. An undefined down_number would make the
- rem download loop stop before fetching anything.
- :ask_number
- set "down_number="
- set /p down_number=# input the number of latest articles to down ^( 1 - !articles! ^) :
- if not defined down_number goto ask_number
- rem Start from an empty link list. A link.bak left behind by an aborted run
- rem would otherwise be appended to and its stale links downloaded again.
- if exist link.bak del link.bak
- set n=1
- rem Collect article links page by page until enough have been gathered or the
- rem last page has been read.
- :loop_begin
- if !n! GTR !pages! goto loop_end
- curl %url_1%?page=!n!>temp.tmp 2>nul
- iconv -c -f utf-8 -t gbk//ignore temp.tmp >gbk.tmp
- sed -r -i "s/[[:space:]]//g" gbk.tmp
- rem Keep only the lines holding an article link and reduce them to the href.
- sed -n -r -i "/target=\"_blank\"href=\".*html/p" gbk.tmp
- sed -r -i "s/.*target=\"_blank\"href=\"(.*html).*/\1/" gbk.tmp
- type gbk.tmp>>link.bak
- set /a n+=1
- rem Recount the collected links after every page.
- set lines=0
- for /f %%i in (link.bak) do set /a lines+=1
- if !lines! LSS !down_number! goto loop_begin
- :loop_end
- set start_time=!time!
- rem From here on n counts the articles that were actually saved.
- set n=0
- :down_begin
- set /a down_number-=1
- if !down_number! LSS 0 goto end
- rem Take the first pending link. The two-quote sentinel survives when
- rem link.bak has no lines left, which ends the run.
- set url=""
- set /p url=<link.bak
- if !url!=="" goto end
- curl http://www.court.gov.cn!url! >temp.tmp 2>nul
- iconv -c -f utf-8 -t gbk//ignore temp.tmp >gbk.tmp
- sed -r -i "s/[[:space:]]//g" gbk.tmp
- rem Title: the line of the div with class title, with all tags removed.
- set "t="
- sed -n "/<divclass=\"title\">/p" gbk.tmp>title.tmp
- sed -r -i "s/<[^>]*>//g" title.tmp
- rem NOTE(review): as posted, the search and replacement characters in the
- rem colon, parenthesis and space substitutions were identical, which makes
- rem them no-ops. That looks like full-width characters having been folded to
- rem ASCII in the post. They are restored here as half-width to full-width. A
- rem half-width colon is not allowed in a Windows file name. Confirm against
- rem the original script.
- sed -i "s/:/：/g" title.tmp
- sed -i "s/(/（/g" title.tmp
- sed -i "s/)/）/g" title.tmp
- set /p t=<title.tmp
- rem Body: everything from the zoom div down to the first CDATA marker.
- sed -n -i "/<divclass=\"txt_txt\"id=\"zoom\">/,/\[CDATA\[/p" gbk.tmp
- rem Presumably removes full-width spaces, see the review note above.
- sed -i "s/　//g" gbk.tmp
- rem Turn every tag into a line break, cut after the closing date line and
- rem drop the empty lines this produces.
- sed -r -i "s/<[^>]*>/\n/g" gbk.tmp
- sed -n -i "1,/^二〇.*年.*月.*日/p" gbk.tmp
- sed -r -i "/^$/d" gbk.tmp
- sed -i "s/(/（/g" gbk.tmp
- sed -i "s/)/）/g" gbk.tmp
- rem Case number: a line that starts with a four digit year in full-width
- rem parentheses and ends with the character for number. With ASCII
- rem parentheses the -r flag would read them as a group, not as literals.
- sed -n -r "/^（[0-9][0-9][0-9][0-9]）.*号$/p" gbk.tmp >num.tmp
- set "file_number="
- set /p file_number=<num.tmp
- rem Indent every line of the saved text.
- sed -r -i "s/^(.*)/ \1/" gbk.tmp
- rem Name the file after the case number and title, or the title alone when no
- rem case number was found. Count the article only when the rename worked, so
- rem the final total is not inflated by failures hidden behind 2>nul.
- set "name=!t!"
- if defined file_number set "name=!file_number! !t!"
- if defined name (ren gbk.tmp "!name!.txt" 2>nul && set /a n+=1)
- rem Drop the link that was just processed.
- sed -i "1 d" link.bak
- echo # !n! articles down
- goto down_begin
- :end
- if exist link.bak del link.bak
- del *.tmp
- echo # mission start at !start_time! end at !time! , !n! succeed
- pause>nul
复制代码
|