看到这个帖子 http://www.bathome.net/thread-69920-1-1.html 中有关于字符串相似度的问题，
于是结合编辑距离和最长公共子序列（LCS）等相关算法，写了三种方法来计算字符串相似度，各有侧重：
- @echo off
- rem Demo driver: score two sample string pairs, then wait for a key press and close.
- call :sim "abcd你好123" "bd好运1314"
- call :sim "abcd你好123" "abcd你好132&"
- pause
- exit
-
- :sim
- rem ---------------------------------------------------------------------
- rem :sim "string1" "string2"
- rem Prints one line with three integer similarity percentages:
- rem   1. LCS length * 100 / length of the longer string
- rem   2. LCS length * 100 / [edit distance + LCS length]
- rem   3. Jaro similarity * 100
- rem If exactly one argument is empty it prints 0 percent and returns;
- rem if both are empty it prints 100 percent and returns.
- rem Uses the :cut helper to read one character of each string.
- rem The length scan below measures strings of up to 4095 characters.
- rem Note: str1, str2, _str1 and _str2 are assigned before setlocal, so they
- rem stay set in the caller's environment after the routine returns.
- rem ---------------------------------------------------------------------
- if "%~1"=="" if not "%~2"=="" (echo,---0%%---&exit /b)
- if "%~2"=="" if not "%~1"=="" (echo,---0%%---&exit /b)
- if "%~2"=="" if "%~1"=="" (echo,---100%%---&exit /b)
- set "str1=%~1"&set "_str1=%~1#"
- set "str2=%~2"&set "_str2=%~2#"
- setlocal enabledelayedexpansion
-
- rem String lengths: binary stepping over a copy that carries one sentinel
- rem character, so the accumulated step total equals the real length.
- set /a len1=0,len2=0
- for %%i in (2048 1024 512 256 128 64 32 16 8 4 2 1) do (
-     if not "!_str1:~%%i,1!"=="" (set /a "len1+=%%i"&set "_str1=!_str1:~%%i!")
-     if not "!_str2:~%%i,1!"=="" (set /a "len2+=%%i"&set "_str2=!_str2:~%%i!")
- )
-
- rem --- Pass 1: edit distance, dynamic programming table in count[i][j] ---
- rem Row 0 and column 0 hold the cost of building a prefix from nothing.
- setlocal
- for /l %%i in (0,1,%len1%) do set /a count[%%i][0]=%%i
- for /l %%j in (0,1,%len2%) do set /a count[0][%%j]=%%j
- for /l %%i in (1,1,%len1%) do (
-     for /l %%j in (1,1,%len2%) do (
-         set /a "pi=%%i-1,pj=%%j-1"
-         call :cut !pi! !pj!
-         set /a "up=count[!pi!][%%j]+1,left=count[%%i][!pj!]+1,diag=count[!pi!][!pj!]"
-         if not "!s1!"=="!s2!" set /a diag+=1
-         if !up! leq !left! (set /a min=up) else set /a min=left
-         if !diag! leq !min! set /a min=diag
-         set /a count[%%i][%%j]=min
-     )
- )
- set /a dist=count[%len1%][%len2%]
- endlocal&set /a dist=%dist%
-
- rem --- Pass 2: longest common subsequence length ---
- rem The table starts empty; set /a reads the undefined border cells as 0.
- setlocal
- for /l %%i in (1,1,%len1%) do (
-     for /l %%j in (1,1,%len2%) do (
-         set /a "pi=%%i-1,pj=%%j-1"
-         call :cut !pi! !pj!
-         if "!s1!"=="!s2!" (
-             set /a count[%%i][%%j]=count[!pi!][!pj!]+1
-         ) else (
-             set /a "up=count[!pi!][%%j],left=count[%%i][!pj!]"
-             if !up! geq !left! (set /a count[%%i][%%j]=up) else set /a count[%%i][%%j]=left
-         )
-     )
- )
- set /a LCS=count[%len1%][%len2%]
- endlocal&set /a LCS=%LCS%
-
- rem --- Pass 3: Jaro similarity ---
- rem minStr is the shorter string and maxStr the longer one. A character of
- rem minStr may match an equal, still unmatched character of maxStr that
- rem lies at most match_max positions away.
- setlocal
- set /a matches=0
- if %len1% geq %len2% (
-     set "maxStr=!str1!"&set "minStr=!str2!"
-     set /a maxLen=len1,minLen=len2
- ) else (
-     set "maxStr=!str2!"&set "minStr=!str1!"
-     set /a maxLen=len2,minLen=len1
- )
- set /a "match_max=maxLen/2-1"
- if !match_max! leq 0 set /a match_max=0
- set /a "_minLen_=minLen-1,_maxLen_=maxLen-1"
- rem Both flag tests are needed: every character on either side is matched
- rem at most once, and for /l cannot be left early after the first hit.
- for /l %%i in (0,1,%_minLen_%) do (
-     set /a "lo=%%i-match_max,hi=%%i+match_max"
-     if !lo! lss 0 set /a lo=0
-     if !hi! gtr %_maxLen_% set /a hi=_maxLen_
-     for /l %%j in (!lo!,1,!hi!) do (
-         if not defined minflag[%%i] if not defined maxflag[%%j] if "!minStr:~%%i,1!"=="!maxStr:~%%j,1!" (
-             set /a "maxflag[%%j]=1,minflag[%%i]=1,matches+=1"
-         )
-     )
- )
- if %matches% equ 0 (endlocal&set /a simPCT3=0&goto :end)
- rem Transpositions: list the matched characters of each string in their
- rem original order, count the positions where the two lists differ, halve.
- set "minSeq="&set "maxSeq="
- for /l %%i in (0,1,%_minLen_%) do if defined minflag[%%i] set "minSeq=!minSeq!!minStr:~%%i,1!"
- for /l %%j in (0,1,%_maxLen_%) do if defined maxflag[%%j] set "maxSeq=!maxSeq!!maxStr:~%%j,1!"
- set /a "trans=0,_matches_=matches-1"
- for /l %%k in (0,1,%_matches_%) do if not "!minSeq:~%%k,1!"=="!maxSeq:~%%k,1!" set /a trans+=1
- set /a trans/=2
- set /a "jaro=(matches*100/len1+matches*100/len2+((matches-trans)*100/matches))/3"
- endlocal&set /a simPCT3=%jaro%
- :end
- rem Combine the three results into the report line.
- if %len1% geq %len2% (set /a maxLen=len1) else set /a maxLen=len2
- set /a "simPCT1=LCS*100/maxLen"
- set /a "simPCT2=LCS*100/(%dist%+%LCS%)"
- echo,---%simPCT1%%%---%simPCT2%%%---%simPCT3%%%---"!str1!"与"!str2!"相似度
- exit /b
- :cut
- rem :cut index1 index2
- rem Helper for :sim. Stores the single character of str1 at zero-based
- rem index1 in s1, and the single character of str2 at index2 in s2.
- rem Requires delayed expansion to be enabled by the caller.
- set "s1=!str1:~%1,1!"
- set "s2=!str2:~%2,1!"
- rem Return explicitly so the helper does not depend on being last in the file.
- exit /b
复制代码
|