<# :
cls&echo off&cd /d "%~dp0"
rem Batch/PowerShell polyglot launcher. cmd.exe runs the lines up to "exit";
rem PowerShell sees the same lines as one block comment and runs what follows it.
rem Usage: save this file as .bat in ANSI/GB2312 encoding, run on Windows 10/11.
rem Note: crawling too fast gets the client rate-limited by the site.
path %SYSTEMROOT%\System32\WindowsPowerShell\v1.0;%path%
set "current=%cd%"
rem The script path is handed over through an environment variable instead of
rem being pasted into the PowerShell command text, so an apostrophe in the
rem path cannot break the quoting.
set "self=%~f0"
powershell -NoProfile -ExecutionPolicy bypass "Get-Content -LiteralPath $env:self|Out-String|Invoke-Expression"
pause
exit
#>
# ---- Configuration ----------------------------------------------------------
# Working directory handed over by the batch launcher through the `current`
# environment variable; the trailing backslash is trimmed so that paths can be
# built by appending '\name'.
$current=($env:current).trimend('\');
# Index page the crawl starts from.
$rooturl='https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html';
# CSV file the crawl results are written to.
$outfile=$current+'\结果.csv';

# ---- Pinyin lookup table ----------------------------------------------------
# $pydic maps one Chinese character to its capitalised, tone-less pinyin.
# The raw table is the `var pydic = "..."` string literal inside pinyin.js,
# which is downloaded once and cached next to the script as py.txt.
$pyfile=$current+'\py.txt';
if(-not (test-path -literal $pyfile)){
    # -UseBasicParsing keeps Windows PowerShell from depending on the Internet
    # Explorer engine (same switch gethtml uses for the page downloads).
    Invoke-WebRequest -UseBasicParsing -Uri 'https://csstools.chinaz.com/tools/js/pinyin.js' -OutFile $pyfile;
}
$enc=[System.Text.Encoding]::UTF8;
$text='';
if(test-path -literal $pyfile){
    $text=[IO.File]::ReadAllText($pyfile, $enc);
}
$pydic=New-Object 'System.Collections.Generic.Dictionary[string, string]';
$m=[regex]::match($text, 'var pydic ?= ?"([^"]+)"');
if($m.success){
    # Each comma-separated entry is one character followed by its pinyin.
    $arr=$m.groups[1].value.trimend(',').split(',');
    foreach($entry in $arr){
        # Skip entries with no pinyin part (or empty entries): an empty pinyin
        # would later make the initial-letter lookup fail.
        if($entry.length -lt 2){continue;}
        $han=$entry.Substring(0,1);
        # Dictionary.Add throws on a repeated key; keep the first occurrence.
        if($pydic.ContainsKey($han)){continue;}
        # Strip tone marks; u-umlaut (with or without tone) is written as 'v'.
        $py=$entry.Substring(1);
        $py=$py -replace '[āáǎà]','a';
        $py=$py -replace '[ōóǒò]','o';
        $py=$py -replace '[ēéěè]','e';
        $py=$py -replace '[īíǐì]','i';
        $py=$py -replace '[ūúǔù]','u';
        $py=$py -replace '[üǖǘǚǜ]','v';
        # Capitalise the first letter.
        $py=$py.Substring(0,1).toUpper()+$py.Substring(1);
        $pydic.add($han, $py);
    }
}
if($pydic.Count -eq 0){
    # Without the table every character is passed through unchanged by hantopy.
    Write-Warning ('pinyin table could not be loaded from '+$pyfile);
}

# Output CSV: the file is (re)created on every run and written through a
# GB2312-encoded StreamWriter.
# NOTE(review): GB2312 is presumably chosen so the CSV opens correctly in tools
# that assume the Chinese ANSI code page - confirm before changing.
$fs=New-Object -TypeName System.IO.FileStream -ArgumentList $outfile, ([System.IO.FileMode]::Create);
$gbEncoding=[Text.Encoding]::GetEncoding('GB2312');
$sw=New-Object -TypeName System.IO.StreamWriter -ArgumentList $fs, $gbEncoding;

# Converts a string to pinyin using the script-level $pydic table.
#   $h  text to convert
#   $s  1 = full pinyin of every known character;
#       any other value = only the first letter of each pinyin
# Characters that are not in $pydic are copied through unchanged.
# Returns the pieces concatenated without a separator.
function hantopy($h, $s){
    $result=New-Object -TypeName System.Text.StringBuilder;
    $pos=0;
    while($pos -lt $h.length){
        $ch=$h.Substring($pos, 1);
        $pos++;
        if(-not $pydic.ContainsKey($ch)){
            # Unknown character: keep it as it is.
            [void]$result.Append($ch);
            continue;
        }
        if($s -eq 1){
            [void]$result.Append($pydic[$ch]);
        }else{
            [void]$result.Append($pydic[$ch].Substring(0, 1));
        }
    }
    return $result.ToString();
}

# Downloads the page at $u and returns its body decoded as UTF-8.
# Up to 10 attempts are made; a failed attempt is reported and followed by a
# 10 second pause. Returns an empty string when no attempt succeeded.
# Every call ends with a 5 second pause to throttle the crawl.
function gethtml($u){
    $t='';
    $fetched=$false;
    $ua='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0';
    for($j=1;$j -le 10;$j++){
        try{
            $req=Invoke-WebRequest -UseBasicParsing -Uri $u -TimeoutSec 8 -UserAgent $ua;
            if($req.StatusCode -eq 200){
                # Decode the raw response bytes directly. Re-encoding the
                # already decoded Content string through ISO-8859-1 is only
                # correct when the response was decoded as Latin-1, and
                # corrupts the text whenever the server declares a charset.
                $t=[Text.Encoding]::UTF8.GetString($req.RawContentStream.ToArray());
                $fetched=$true;
                break;
            }
        }catch{
            write-host ('第'+$j.ToString()+'次获取网页内容失败');
            start-sleep -Seconds 10;
        }
    }
    if(-not $fetched){
        # Do not fail silently: the caller receives '' and this page's rows
        # will be missing from the result file.
        write-host ('多次重试后仍无法获取,已跳过: '+$u);
    }
    start-sleep -Seconds 5;
    return $t;
}

# Removes HTML tags from $s and trims surrounding whitespace.
function striptags($s){
    return ($s -replace '<[^>]+?>','').trim();
}

# Returns a fresh record for one table row, all fields empty.
# code2 is filled for village rows but is not part of the CSV line.
function newitem(){
    return @{
        'code1'='';
        'code2'='';
        'url'='';
        'name'='';
        'pinyin1'='';
        'pinyin2'=''
    };
}

# Fills in the pinyin fields of $item, echoes the name to the console and
# appends the CSV line to the script-level writer $sw (flushed per row so that
# an interrupted crawl keeps what was already collected).
function writeitem($item){
    $item['pinyin1']=hantopy $item['name'] 1;
    $item['pinyin2']=hantopy $item['name'] 2;
    write-host $item['name'];
    # Line starts with an apostrophe, followed by code, name, full pinyin, initials.
    $line="'"+$item['code1']+','+$item['name']+','+$item['pinyin1']+','+$item['pinyin2'];
    $sw.WriteLine($line);
    $sw.Flush();
}

# Parses the division table contained in $html (the page downloaded from $url),
# writes one CSV row per entry and recurses into linked sub-pages.
#   $html  page source; an empty string yields no rows
#   $url   address the page came from, used to resolve relative links
# Links found in town and village rows are not followed.
function geturl($html, $url){
    # Directory part of the page URL (everything up to the last '/').
    $from=$url -replace '[^/]+$','';
    $linkpattern='<a href="([^"]+?)"[^>]*?>([\s\S]+?)</a>';
    $m1=[regex]::matches($html, '<tr class="(provincetr|citytr|countytr|towntr|villagetr)"[^>]*?>([\s\S]+?)</tr>');
    for($i=0;$i -lt $m1.count;$i++){
        $kind=$m1[$i].groups[1].value;
        $m2=[regex]::matches($m1[$i].groups[2].value, '<td(?: [^>]+?)?>(.+?)</td>');
        if($m2.count -lt 1){continue;}

        if($kind -eq 'provincetr'){
            # A province row holds several provinces, one linked name per cell.
            for($j=0;$j -lt $m2.count;$j++){
                $m3=[regex]::match($m2[$j].groups[1].value, $linkpattern);
                # A cell without a link carries no province; skipping it avoids
                # writing an empty row.
                if(-not $m3.Success){continue;}
                $item=newitem;
                $item['url']=$from+$m3.groups[1].value;
                $item['name']=striptags $m3.groups[2].value;
                writeitem $item;
                if($item['url'] -ne ''){
                    geturl (gethtml $item['url']) $item['url'];
                }
            }
            continue;
        }

        $item=newitem;
        if($kind -eq 'villagetr'){
            # Village rows: code, second code, name.
            if($m2.count -lt 3){continue;}
            $item['code1']=striptags $m2[0].groups[1].value;
            $item['code2']=striptags $m2[1].groups[1].value;
            $item['name']=striptags $m2[2].groups[1].value;
        }else{
            # City / county / town rows: code, name (the name may be a link).
            if($m2.count -lt 2){continue;}
            $item['code1']=striptags $m2[0].groups[1].value;
            $item['name']=striptags $m2[1].groups[1].value;
            $m3=[regex]::match($m2[1].groups[1].value, $linkpattern);
            if($m3.Success){
                $item['url']=$from+$m3.groups[1].value;
            }
        }
        writeitem $item;
        if($item['url'] -ne ''){
            # Stop descending at town level.
            if($kind -notmatch '(towntr|villagetr)'){
                geturl (gethtml $item['url']) $item['url'];
            }
        }
    }
}
# ---- Entry point ------------------------------------------------------------
# To crawl only one sub-page, override the start URL, for example:
#$rooturl='https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11/01/110102.html'
try{
    geturl (gethtml $rooturl) $rooturl;
}finally{
    # Release the output file even when the crawl is interrupted or throws.
    $sw.Close();
    $fs.Close();
}
exit;