Board logo

标题: [原创] VBS / JS / Javascript 使用OpenCC词库实现词汇级别的繁简转换(By Yu2n) [打印本页]

作者: yu2n    时间: 2014-4-6 19:19     标题: VBS / JS / Javascript 使用OpenCC词库实现词汇级别的繁简转换(By Yu2n)

本帖最后由 yu2n 于 2014-4-6 19:26 编辑

VBS 使用OpenCC词库实现词汇级别的繁简转换(By Yu2n)
  
示例:词汇级别简转繁

  输入:
    中华人民共和国是工人阶级领导的、以工农联盟为基础的人民****的社会主义国家。
    干活 干杯  西太后   后天
    划过来
   
  输出:
    中華人民共和國是工人階級領導的、以工農聯盟爲基礎的人民**專政的社會主義国家。
    幹活 乾杯  西太后   後天
    划過來

VBS / JS / Javascript 使用OpenCC词库实现词汇级别的繁简转换(By Yu2n)

源码下载:http://yu2n.sinaapp.com/file/?dir=/file/tools/OpenCC/

OpenCC-WSH.vbs
  1. Dim dictionary_path, TSCharacters, TSPhrases, STCharacters, STPhrases
  2. dictionary_path = Left(WScript.ScriptFullName, Len(WScript.ScriptFullName) - Len(WScript.ScriptName))
  3. TSCharacters = ReadText(dictionary_path + "TSCharacters.txt")
  4. TSPhrases = ReadText(dictionary_path + "TSPhrases.txt")
  5. STCharacters = ReadText(dictionary_path + "STCharacters.txt")
  6. STPhrases = ReadText(dictionary_path + "STPhrases.txt")
  7. ' 测试
  8. Call Test()
  9. Function Test()
  10.   Dim str1, str2
  11.   str1 = "中华人民共和国是工人阶级领导的、以工农联盟为基础的人民****的社会主义国家。"
  12.   str1 = str1  & vbCrLf & "干活 干杯  西太后   后天"
  13.   str1 = str1  & vbCrLf & "划过来"
  14.   str2 = "中華人民共和國是工人階級領導的、以工農聯盟爲基礎的人民**專政的社會主義国家。"
  15.   str2 = str2  & vbCrLf & "幹活 乾杯  西太后   後天"
  16.   str2 = str2  & vbCrLf & "划過來"
  17.   
  18.   WScript.Echo Now() & vbTab & " TC2SC() "
  19.   WScript.Echo Now() & VbCrLf & str1 & vbCrLf & " ==> "
  20.   WScript.Echo Now() & VbCrLf & SC2TC(str1) & vbCrLf & vbCrLf
  21.   
  22.   WScript.Echo Now() & vbTab & " TC2SC() "
  23.   WScript.Echo Now() & VbCrLf & str2 & vbCrLf & " ==> "
  24.   WScript.Echo Now() & VbCrLf & TC2SC(str2)
  25. End Function
  26. ' 载入词库是否完成
  27. Function isInitDic()
  28.   isInit = Not (STCharacters="" Or STPhrases="" Or TSCharacters="" Or TSPhrases="")
  29.   If Not isInit Then WScript.Echo("正在载入词库,请稍等……")
  30.   isInitDic = isInit
  31. End Function
  32. ' 简转繁
  33. Function SC2TC(str)
  34.   If (isInitDic()) Then
  35.     SC2TC = TCSCConverter(STPhrases & vbLf & STCharacters, str)
  36.   Else
  37.     SC2TC = ""
  38.   End If
  39. End Function
  40. ' 繁转简
  41. Function TC2SC(str)
  42.   If (isInitDic()) Then
  43.     TC2SC = TCSCConverter(TSPhrases & vbLf & TSCharacters, str)
  44.   Else
  45.     TC2SC = ""
  46.   End If
  47. End Function
  48. ' 使用 OpenCC 词库转换
  49. Function TCSCConverter(strDictionaryOpenCC, strSrc)
  50.   ' 词组库排序替换:按字符串长度降序排序
  51.   arr_Phrases = Split(strDictionaryOpenCC, vbLf)
  52.   Call OpenCC_Dic_Sort(arr_Phrases)     ' 将数组按字符串长度降序排序
  53.   
  54.   ' 词组替换
  55.   Dim i, arr_find()
  56.   ReDim Preserve arr_find(0)
  57.   For i = 0 To UBound(arr_Phrases)
  58.     If (InStr(arr_Phrases(i), vbTab) > 1) And (Len(arr_Phrases(i)) >=3) Then
  59.       Dim str_SrcPhrases, str_DesPhrases
  60.       str_SrcPhrases = Split(arr_Phrases(i), vbTab)(0)
  61.       str_DesPhrases = Split(arr_Phrases(i), vbTab)(1)
  62.       If (InStr(strSrc, str_SrcPhrases) > 0) And (str_SrcPhrases <> "" ) Then
  63.         ReDim Preserve arr_find(UBound(arr_find)+1)
  64.         If (InStr(str_DesPhrases, " ") > 1) And (Len(str_DesPhrases) >= 3) Then
  65.           arr_find(UBound(arr_find)) = "<[?" & UBound(arr_find) & "?]>" & vbTab & Split(str_DesPhrases," ")(0)
  66.         Else
  67.           arr_find(UBound(arr_find)) = "<[?" & UBound(arr_find) & "?]>" & vbTab & str_DesPhrases
  68.         End If
  69.         strSrc = Replace(strSrc, str_SrcPhrases, "<[?" & UBound(arr_find) & "?]>")  ' 增加替换标记
  70.       End If
  71.     End If
  72.   Next
  73.   
  74.   ' 还原替换标记
  75.   For i = 0 To UBound(arr_find)
  76.     If (InStr(arr_find(i), vbTab) > 1) And (Len(arr_find(i)) >=8) Then
  77.       'WScript.Echo Split(arr_find(i), vbTab)(0) & vbTab & Split(arr_find(i), vbTab)(1)
  78.       
  79.       If (InStr(strSrc, Split(arr_find(i), vbTab)(0)) > 0) Then
  80.         If Split(arr_find(i), vbTab)(1) <> "" Then
  81.           strSrc = Replace(strSrc, Split(arr_find(i), vbTab)(0), Split(arr_find(i), vbTab)(1))  ' 还原替换标记
  82.         End If
  83.       End If
  84.       
  85.     End If
  86.   Next
  87.   
  88.   TCSCConverter = strSrc
  89. End Function
  90. ' 词组库排序替换:按字符串长度降序排序
  91. Function OpenCC_Dic_Sort(ByRef arr)
  92.   Dim arrTable()
  93.   ReDim Preserve arrTable(UBound(arr),1)
  94.   Dim intRow
  95.   For intRow = 0 To UBound(arr)
  96.     arrTable(intRow,0) = Len(arr(intRow))
  97.     arrTable(intRow,1) = arr(intRow)
  98.   Next
  99.   Call arrSort(arrTable, 0)
  100.   For intRow = 0 To UBound(arr)
  101.     arr(intRow) = arrTable(intRow,1)
  102.   Next
  103. End Function
  104. ' 二维数组排序:14W纪录级别
  105. 'arr           --待排序二维数组
  106. 'intSortField  --待排序字段索引
  107. Function arrSort(ByRef arr, ByVal intSortField)
  108.   Const adOpenStatic = 3
  109.   Const adUseClient = 3
  110.   Const adDouble = 5
  111.   Const adVarChar = 200
  112.   Const adLongVarWChar = 203 ' DataTypeEnum: adLongVarWChar -- Indicates a long null-terminated Unicode string value.
  113.   Set rs = CreateObject("ADODB.Recordset")
  114.   rs.CursorLocation = adUseClient
  115.   
  116.   Dim intRow, intCol
  117.   '给记录集添加字段,以前缀+索引的形式
  118.   For intCol = 0 To UBound(arr, 2)
  119.     If intCol = intSortField Then
  120.       rs.Fields.append intCol, adDouble ' 数字
  121.     Else
  122.       rs.Fields.append intCol, adLongVarWChar, 1024
  123.     End If
  124.   Next
  125.   rs.CursorType = adOpenStatic
  126.   rs.Open
  127.   
  128.   '将数组插入进记录中
  129.   For intRow = 0 To UBound(arr, 1)
  130.     rs.AddNew
  131.     For intCol = 0 To UBound(arr, 2)
  132.       If intCol = intSortField Then
  133.         rs(intCol) = CDBl(arr(intRow, intCol))
  134.       Else
  135.         rs(intCol) = arr(intRow, intCol)
  136.       End If
  137.     Next
  138.     rs.Update
  139.   Next
  140.   
  141.   '设置排序字段
  142.   rs.Sort = rs(intSortField).Name & " DESC"
  143.   rs.MoveFirst
  144.   '将排好序的数据重新赋给数组
  145.   For intRow = 0 To UBound(arr, 1)
  146.     For intCol = 0 To UBound(arr, 2)
  147.       arr(intRow, intCol) = rs(intCol)
  148.     Next
  149.     rs.MoveNext
  150.   Next
  151. End Function
  152. ' 使用 utf-8 编码读写文本文件
  153. Function ReadText(FileName)
  154.   ReadText = Pfile(FileName, "utf-8", "ForReading", "")
  155. End Function
  156. Function SaveText(FileName, TextString)
  157.   SaveText = Pfile(FileName, "utf-8", "ForWriting", TextString)
  158. End Function
  159. Function SaveWSH(FileName, TextString)
  160.   SaveWSH = Pfile(FileName, "Unicode", "ForWriting", TextString)
  161. End Function
  162. Function LogText(FileName, TextString)
  163.   LogText = Pfile(FileName, "utf-8", "ForAppending", TextString)
  164. End Function
  165. Function Pfile(FileName, FileCode, strType, TextString)
  166.   Set fso = CreateObject("Scripting.FileSystemObject")
  167.   Set objStream = CreateObject("ADODB.Stream")
  168.   objStream.Type = 2
  169.   objStream.Mode = 3
  170.   objStream.Charset = FileCode     '不同编码时自己换,Chinese (Simplified) (GB2312),中文 GBK ,繁体中文 Big5 ,日文 EUC-JP ,韩文 EUC-KR,charset=UTF-8(国际化编码),ANSI,Unicode,unicode big endian
  171.   If (fso.FileExists(FileName)) Then
  172.     objStream.Open()
  173.     objStream.LoadFromFile FileName
  174.     if (strType = "ForReading") Then TextString = objStream.ReadText()
  175.     if (strType = "ForAppending") Then TextString = TextString & objStream.ReadText()
  176.     objStream.Close()
  177.   End If
  178.   If (strType = "ForWriting") Or (strType ="ForAppending") Then
  179.     objStream.Open()
  180.     objStream.WriteText TextString
  181.     objStream.SaveToFile FileName, 2
  182.     objStream.Close()
  183.   End If
  184.   Set objStream = Nothing
  185.   If (strType = "ForReading") Then Pfile = TextString
  186.   If (strType = "ForWriting") Or (strType = "ForAppending") Then Pfile = True
  187. End Function
复制代码





欢迎光临 批处理之家 (http://bbs.bathome.net/) Powered by Discuz! 7.2