本帖最后由 flashercs 于 2019-11-9 11:08 编辑
<#*,:&cls
@echo off
rem Batch half of a batch/PowerShell hybrid script. cmd executes these lines,
rem while PowerShell treats everything up to the closing marker as a comment.
pushd "%~dp0"
rem The fully qualified script path is passed on purpose: after pushd a
rem relative path in argument 0 would no longer resolve.
Powershell -NoProfile -ExecutionPolicy RemoteSigned -Command ". ([ScriptBlock]::Create((Get-Content -LiteralPath \"%~f0\" -ReadCount 0 | Out-String ))) "
popd
pause
exit /b
#>

# Looks up every word listed in $inputFile on http://dict.cn/ and writes one
# tab-separated line per word to $outputFile:
#   keyword <TAB> translations <TAB> sense:percent,... <TAB> example sentences
# A word whose page could not be downloaded (after three attempts) is written
# back on a line of its own; a page without a recognizable main <div> produces
# no output line. Errors are printed in red and processing continues.
$VerbosePreference = "Continue"
$inputFile = "单词列表.txt"
$outputFile = "结果.txt"
Get-Content -LiteralPath $inputFile -OutBuffer 100 | ForEach-Object -Begin {
    # System.Web provides HttpUtility, System.Web.Extensions the JSON serializer.
    Add-Type -AssemblyName System.Web
    Add-Type -AssemblyName System.Web.Extensions

    # Output file: overwritten, system default ANSI code page, 64 KiB buffer.
    $sw = New-Object -TypeName System.IO.StreamWriter -ArgumentList $outputFile, $false, ([System.Text.Encoding]::Default), 65536

    # Web client settings. Each word is requested relative to BaseAddress.
    $webclient = New-Object -TypeName System.Net.WebClient
    $webclient.BaseAddress = 'http://dict.cn/'
    $webclient.Encoding = [System.Text.Encoding]::UTF8
    $webclient.Headers.Add("Accept", "text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8")
    $webclient.Headers.Add("Accept-Encoding", "gzip")
    $webclient.Headers.Add("Accept-Language", "en-US, en; q=0.8, zh-Hans-CN; q=0.5, zh-Hans; q=0.3")

    # XML parser, reused for every page.
    $xmldoc = New-Object -TypeName System.Xml.XmlDocument

    # Regular expressions.
    # Everything from <div class="main"> up to (not including) <div class="righter">.
    $remain = [regex]'(?si)<div\s+class="main"[^>]*>.*?(?=<div\s+class="righter")'
    # $recontent = [regex]'(?si)<div\s+class="word"[^>]*>.*?(?=<div\s+class="section[^"]*"[^>]*>)'
    # Whole <script> elements, which are stripped before XML parsing.
    $rejs = [regex]'(?si)<script[^>]*>.*?</script>'
    # $resent = [regex]'(?si)(?<=<h3[^>]*>例句</h3>).*?(?=<h3)'
    # An '&' run up to its ';' (group 1), or up to the next '<' / end of input
    # when the terminating ';' is missing (group 1 is then empty).
    $reXMLEntities = [regex]'(?si)&[^;<]*(;|(?=<|$))'
    # Normalizes one entity match: appends the missing ';', then decodes and
    # re-encodes it so that HTML-only entities become XML-safe text.
    $evaluator = {
        param($m)
        $s = $m.Value;
        if ($m.Groups[1].Value -eq '') {
            $s += ';'
        }
        [System.Web.HttpUtility]::HtmlEncode([System.Web.HttpUtility]::HtmlDecode($s))
    } -as [System.Text.RegularExpressions.MatchEvaluator]

    # Builder for the current output line, cleared for each word.
    $strbuilder = New-Object -TypeName System.Text.StringBuilder
    $JSON = New-Object -TypeName System.Web.Script.Serialization.JavascriptSerializer -ErrorAction Stop
    # $vsaengine = [Microsoft.JScript.Vsa.VsaEngine]::CreateEngine()
} -Process {
    # A blank line would request the site's home page instead of a word.
    if ("$_".Trim() -eq '') {
        return
    }

    # Start every word with clean handles so that nothing from the previous
    # iteration is mistaken for the current response.
    $readstream = $null
    $gzipstream = $null
    $sr = $null

    Write-Verbose "Fetching $_ ..."
    # Up to three download attempts; inside catch, $_ is the error record.
    for ($i = 2; $i -ge 0; $i--) {
        try {
            $readstream = $webclient.OpenRead($_)
            Write-Verbose "Fetch $_ success"
            break
        } catch {
            $_ | Out-String | Write-Host -ForegroundColor Red
        }
    }

    if ($readstream) {
        try {
            # gzip is only requested, not guaranteed: decompress only when the
            # server says the body is actually gzip-encoded.
            $contentEncoding = ''
            if ($null -ne $webclient.ResponseHeaders) {
                $contentEncoding = $webclient.ResponseHeaders['Content-Encoding']
            }
            $bodystream = $readstream
            if ($contentEncoding -match 'gzip') {
                $gzipstream = New-Object -TypeName System.IO.Compression.GZipStream -ArgumentList $readstream, ([System.IO.Compression.CompressionMode]::Decompress)
                $bodystream = $gzipstream
            }
            $sr = New-Object -TypeName System.IO.StreamReader -ArgumentList $bodystream, ([System.Text.Encoding]::UTF8)
            $strhtml = $sr.ReadToEnd()
            $match = $remain.Match($strhtml)
            # div.main matched
            if ($match.Success) {
                Write-Verbose "Match $_ success"
                # Convert the HTML fragment to well-formed XML: drop scripts and
                # comments, escape pseudo-tags whose name is entirely non-ASCII
                # (they are text, never closed, and would break LoadXml), and
                # self-close <br>.
                $strxml = $rejs.Replace($match.Value, '') -replace '(?s)<!--.*?-->' -replace '<([^\x00-\x7e]+)>', '&lt;$1&gt;' -replace '<br>', '<br/>'
                $strxml = $reXMLEntities.Replace($strxml, $evaluator)
                $xmldoc.LoadXml($strxml)
                $strbuilder.Length = 0
                $nodeWord = $xmldoc.DocumentElement.SelectSingleNode('div[@class="word"]')

                # Column 1: the keyword (word-cont).
                [void]$strbuilder.Append($nodeWord.SelectSingleNode('.//h1[@class="keyword"]/text()').Value).Append("`t")

                # Column 2: translations (dict-translation), every <li> but the last.
                [void]$strbuilder.Append( ($nodeWord.SelectNodes('.//ul/li[position()<last()]') | ForEach-Object { $_.innerText }) -join " " ).Append("`t")

                # Column 3: sense distribution (dict-chart), stored as URL-escaped
                # JSON in the data attribute.
                $nodeChartBasic = $nodeWord.SelectSingleNode('.//div[@id="dict-chart-basic"]/@data')
                if ($nodeChartBasic) {
                    $strjson = [System.Uri]::UnescapeDataString($nodeChartBasic.Value)
                    $jsobj = $JSON.DeserializeObject($strjson)
                    $chartStart = $strbuilder.Length
                    foreach ($field in $jsobj.Keys) {
                        [void]$strbuilder.Append($jsobj.Item($field).Item('sense')).Append(':').Append($jsobj.Item($field).Item('percent')).Append(',')
                    }
                    # Drop the trailing ',' - but only if an entry was written,
                    # otherwise the tab that ends column 2 would be removed.
                    if ($strbuilder.Length -gt $chartStart) {
                        [void]$strbuilder.Remove($strbuilder.Length - 1, 1)
                    }
                }
                [void]$strbuilder.Append("`t")

                # Column 4: example sentences (the block under the 例句 heading).
                # $match1 = $resent.Match($strhtml)
                $nodeSent = $xmldoc.DocumentElement.SelectSingleNode('div[@class="section sent"]/h3[text()="例句"]/following-sibling::div')
                if ($nodeSent) {
                    try {
                        # $xmldoc.LoadXml(($match1.Value -replace '<br>', '<br/>' -replace '<([^\x00-\x7e]+)>', '&lt;$1&gt;'))
                        $sentStart = $strbuilder.Length
                        $nodeSent.SelectNodes('ol') | ForEach-Object {
                            # Group heading from the node preceding the list, e.g.
                            # "用作形容词 (.adj)": whitespace removed, then a space
                            # inserted before each '('.
                            [void]$strbuilder.Append(($_.PreviousSibling.InnerText -replace '\s+' -replace '\(', ' $&')).Append('<br>')
                            # At most the first two sentences of each group, numbered.
                            $_.SelectNodes('li[position()<3]') | ForEach-Object -Begin { $index = 0 } -Process {
                                $index++;
                                [void]$strbuilder.Append("$index.$(($_.SelectNodes('text()')|ForEach-Object {$_.Value}) -join '<br>')").Append('<br>')
                            }
                        }
                        # Drop the trailing '<br>' - but only if a group was
                        # written, otherwise earlier columns would be truncated.
                        if ($strbuilder.Length -gt $sentStart) {
                            [void]$strbuilder.Remove($strbuilder.Length - 4, 4)
                        }
                    } catch {
                        $_ | Out-String | Write-Host -ForegroundColor Red
                    }
                } else {
                    Write-Verbose "没有例句."
                }

                # Output the result on a single line (embedded line breaks removed).
                $sw.WriteLine(($strbuilder.ToString() -replace "[\r\n]+"))
            } else {
                Write-Verbose "Match $_ failed"
            }
        } catch {
            $_ | out-string | Write-Host -ForegroundColor Red
        } finally {
            # Release in reverse order of creation; disposing an already closed
            # stream is harmless.
            if ($sr) {
                $sr.Dispose()
            }
            if ($gzipstream) {
                $gzipstream.Dispose()
            }
            $readstream.Close()
        }
    } else {
        # Download failed three times: keep the bare word in the output so it
        # can be spotted and retried.
        $sw.WriteLine($_)
    }
} -End {
    $sw.Dispose()
    $webclient.Dispose()
}
复制代码
|