众所周知,IF命令比较字符串时采用的排序规则既不是按照GBK编码的顺序,也不是按照Unicode编码的顺序,而是有着自己的规则,这个规则是什么呢?
在CMD内部,IF命令是调用lstrcmpW函数来比较字符串大小的(参见《批处理技术内幕:IF命令》),因此IF命令的比较规则即lstrcmpW函数的比较规则。
lstrcmp(Locale String Compare)函数的排序是与系统的语言与区域设置有关的(参考《Windows 代码页与字符顺序》)。
但是具体怎么排序MSDN却没有说明(至少我没有找到),为了弄清楚默认情况下IF的比较顺序,我写了一个简单C程序: | #include <stdio.h> | | #include <stdlib.h> | | #include <string.h> | | #include <windows.h> | | | | | | | | | | | | | | | | #define BUFFER_SIZE 1024 | | | | typedef struct _table { | | int cp936; | | wchar_t *unicode; | | char *name; | | } table; | | | | int compare(const void *a, const void *b); | | | | int main() | | { | | table *a; | | int n = 0, i = 0; | | wchar_t *p; | | char buf[BUFFER_SIZE], *p1, *p2; | | FILE *fp1, *fp2; | | | | | | fp1 = fopen("CP936.txt", "rb"); | | if (fp1 == NULL) { | | fprintf(stderr, "Can't open CP936.txt\n"); | | return 1; | | } | | | | | | while (!feof(fp1) && fgets(buf, BUFFER_SIZE, fp1)) { | | if (strlen(buf) == 0 || buf[0] == '#') continue; | | if (p1 = strchr(buf, '\t')) *p1++ = '\0'; | | if (p2 = strchr(p1, '\t')) *p2++ = '\0'; | | while (isspace(*p1)) p1++; | | if (!*p1) continue; | | n++; | | } | | | | | | a = (table *) malloc(n * sizeof(table)); | | | | | | fp1 = freopen("CP936.txt", "rb", fp1); | | if (fp1 == NULL) { | | fprintf(stderr, "Can't reopen CP936.txt\n"); | | return 1; | | } | | | | | | while (!feof(fp1) && fgets(buf, BUFFER_SIZE, fp1)) { | | if (strlen(buf) == 0 || buf[0] == '#') continue; | | if (p1 = strchr(buf, '\t')) *p1++ = '\0'; | | if (p2 = strchr(p1, '\t')) *p2++ = '\0'; | | while (isspace(*p1)) p1++; | | if (!*p1) continue; | | p = (wchar_t *) malloc(2 * sizeof(wchar_t)); | | p[0] = (wchar_t) strtol(p1, NULL, 16); | | p[1] = 0x0000; | | a[i].cp936 = strtol(buf, NULL, 16); | | a[i].unicode = p; | | a[i].name = strdup(p2); | | i++; | | } | | | | | | qsort(a, n, sizeof(table), compare); | | | | | | fp2 = fopen("CP936_SORT.txt", "wb"); | | if (fp2 == NULL) { | | fprintf(stderr, "Can't open CP936_SORT.txt\n"); | | return 1; | | } | | | | | | for (i = 0; i < n; i++) { | | fprintf(fp2, "0x%02X\t0x%04X\t%s", a[i].cp936, a[i].unicode[0], a[i].name); | | } | | | | | | for (i = 0; i < n; i++) { | | free(a[i].unicode); | | free(a[i].name); | | } | | free(a); 
| | | | | | fclose(fp1); | | fclose(fp2); | | return 0; | | } | | | | | | int compare(const void *a, const void *b) | | { | | wchar_t *s1 = ((table *)a)->unicode; | | wchar_t *s2 = ((table *)b)->unicode; | | return lstrcmpW(s1, s2); | | }COPY |
CP936.TXT可以到Unicode官方网站下载到(http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT)。
程序运行后会生成CP936_SORT.txt,里面是排序后的CP936到Unicode的映射表,
第一列是GBK码,第二列是对应的Unicode代码点(Code Point),第三列是字符的Unicode名称。
不想自己编译的话可以下载我编译好的EXE: |