标题: [文本处理] 批处理计算字符串相似度 [打印本页]
作者: buyiyang 时间: 2024-10-11 13:28 标题: 批处理计算字符串相似度
看到这个帖子 http://www.bathome.net/thread-69920-1-1.html 里有关于字符串相似度的问题,
于是结合编辑距离(Levenshtein)、最长公共子序列(LCS)和 Jaro 相似度这几种算法,写了三种计算字符串相似度的方法,各有侧重:
- @echo off
rem Demo driver: compare two sample pairs, then wait for a key press so the
rem console window stays open long enough to read the output.
call :sim "abcd你好123" "bd好运1314"
call :sim "abcd你好123" "abcd你好132&"
pause
exit
-
rem ---------------------------------------------------------------------------
rem :sim "string1" "string2"
rem Prints one line holding three similarity percentages for the two arguments,
rem followed by the two strings themselves:
rem   1. LCS length * 100 / length of the longer string
rem   2. LCS length * 100 / [edit distance + LCS length]
rem   3. Jaro similarity * 100
rem LCS is the longest common subsequence and the edit distance is the
rem Levenshtein distance. All arithmetic is integer arithmetic, so every
rem percentage is truncated. Two empty strings print 100 and exactly one empty
rem string prints 0. Uses the :cut helper and leaves no variables behind in the
rem caller environment.
rem ---------------------------------------------------------------------------
:sim
if "%~1"=="" if "%~2"=="" (echo,---100%%---&exit /b)
if "%~1"=="" (echo,---0%%---&exit /b)
if "%~2"=="" (echo,---0%%---&exit /b)

rem Copy the arguments while delayed expansion is off so that exclamation
rem marks inside them survive. Both scopes opened here are closed implicitly
rem by the final exit /b.
setlocal disabledelayedexpansion
set "str1=%~1"&set "_str1=%~1#"
set "str2=%~2"&set "_str2=%~2#"
setlocal enabledelayedexpansion

rem Binary-search string length. The trailing # sentinel makes the probe at
rem offset n succeed exactly when at least n real characters remain.
set /a len1=0,len2=0
for %%n in (2048 1024 512 256 128 64 32 16 8 4 2 1) do (
  if not "!_str1:~%%n,1!"=="" (set /a "len1+=%%n"&set "_str1=!_str1:~%%n!")
  if not "!_str2:~%%n,1!"=="" (set /a "len2+=%%n"&set "_str2=!_str2:~%%n!")
)

rem --- Levenshtein edit distance -------------------------------------------
rem count[i][j] is the distance between the first i characters of str1 and
rem the first j characters of str2.
setlocal
for /l %%i in (0,1,%len1%) do set /a "count[%%i][0]=%%i"
for /l %%j in (0,1,%len2%) do set /a "count[0][%%j]=%%j"
for /l %%i in (1,1,%len1%) do (
  for /l %%j in (1,1,%len2%) do (
    set /a "ii=%%i-1,jj=%%j-1"
    call :cut !ii! !jj!
    set /a "cDel=count[!ii!][%%j]+1,cIns=count[%%i][!jj!]+1,cSub=count[!ii!][!jj!]"
    if not "!s1!"=="!s2!" set /a cSub+=1
    set /a best=cDel
    if !cIns! lss !best! set /a best=cIns
    if !cSub! lss !best! set /a best=cSub
    set /a "count[%%i][%%j]=best"
  )
)
set /a "dist=count[%len1%][%len2%]"
rem The percent value is substituted before endlocal runs, which carries the
rem result out of the scratch scope while the count table is discarded.
endlocal&set /a dist=%dist%

rem --- Longest common subsequence length -----------------------------------
rem Row 0 and column 0 are never written. set /a reads undefined names as 0.
setlocal
for /l %%i in (1,1,%len1%) do (
  for /l %%j in (1,1,%len2%) do (
    set /a "ii=%%i-1,jj=%%j-1"
    call :cut !ii! !jj!
    if "!s1!"=="!s2!" (
      set /a "count[%%i][%%j]=count[!ii!][!jj!]+1"
    ) else (
      set /a "cUp=count[!ii!][%%j],cLeft=count[%%i][!jj!]"
      if !cUp! geq !cLeft! (set /a "count[%%i][%%j]=cUp") else set /a "count[%%i][%%j]=cLeft"
    )
  )
)
set /a "LCS=count[%len1%][%len2%]"
endlocal&set /a LCS=%LCS%

rem --- Jaro similarity ------------------------------------------------------
setlocal
set /a matches=0
if %len1% geq %len2% (
  set "maxStr=!str1!"&set "minStr=!str2!"
  set /a maxLen=len1,minLen=len2
) else (
  set "maxStr=!str2!"&set "minStr=!str1!"
  set /a maxLen=len2,minLen=len1
)
rem Characters only match when their positions differ by at most match_max.
set /a "match_max=maxLen/2-1"
if !match_max! lss 0 set /a match_max=0
set /a "_minLen_=minLen-1,_maxLen_=maxLen-1"
for /l %%i in (0,1,%_minLen_%) do (
  set "minChar=!minStr:~%%i,1!"
  set /a "jj=%%i-match_max,jj_lim=%%i+match_max"
  if !jj! lss 0 set /a jj=0
  if !jj_lim! gtr %_maxLen_% set /a jj_lim=_maxLen_
  for /l %%j in (!jj!,1,!jj_lim!) do (
    rem Each character on either side may take part in one match only.
    if not defined minflag[%%i] if not defined maxflag[%%j] if "!minChar!"=="!maxStr:~%%j,1!" (
      set /a "maxflag[%%j]=1,minflag[%%i]=1,matches+=1"
    )
  )
)
rem Walk the matched characters of both strings in order. Every pair that
rem differs is half a transposition.
set /a trans=0,jj=0
for /l %%i in (0,1,%_minLen_%) do (
  if defined minflag[%%i] (
    set "hit="
    for /l %%j in (!jj!,1,%_maxLen_%) do if not defined hit if defined maxflag[%%j] set "hit=%%j"
    for %%k in (!hit!) do if not "!minStr:~%%i,1!"=="!maxStr:~%%k,1!" set /a trans+=1
    set /a jj=hit+1
  )
)
set /a trans/=2
rem With no matches the similarity is 0. This also avoids dividing by zero.
set /a jaro=0
if !matches! gtr 0 set /a "jaro=(matches*100/len1+matches*100/len2+((matches-trans)*100/matches))/3"
endlocal&set /a simPCT3=%jaro%

rem --- Report ---------------------------------------------------------------
if %len1% geq %len2% (set /a maxLen=len1) else set /a maxLen=len2
set /a "simPCT1=LCS*100/maxLen"
set /a "simPCT2=LCS*100/(dist+LCS)"
echo,---%simPCT1%%%---%simPCT2%%%---%simPCT3%%%---"!str1!"与"!str2!"相似度
exit /b
rem :cut i j
rem Helper for :sim. Stores the single character at zero-based offset i of
rem str1 in s1, and the one at offset j of str2 in s2. An offset at or past the
rem end of a string gives an empty value. Delayed expansion must be enabled by
rem the caller, and str1 and str2 must already be set.
:cut
set "s1=!str1:~%1,1!"
set "s2=!str2:~%2,1!"
rem Return explicitly instead of relying on this label being last in the file.
exit /b
复制代码
作者: flashercs 时间: 2024-10-11 19:31
搞算法的都是牛人.
作者: buyiyang 时间: 2024-10-11 19:56
回复 2# flashercs
哈哈,是的,都是那帮牛人研究出来的。算法的本质就是数学,研究算法会涉及很多数学和数据结构方面的东西。
作者: flashercs 时间: 2024-10-11 20:04
我看不懂这算法,我这老年人也搞不了算法了...
欢迎光临 批处理之家 (http://bbs.bathome.net/) |
Powered by Discuz! 7.2 |