本帖最后由 happy886rr 于 2017-10-25 23:27 编辑
[词频、字频统计工具trie]
这是继文本工具tl之后的又一文本第三方,统计多国语言词频、字频、单词频率。可以智能的做前缀是否去重,智能粒进匹配。智能切词。速度比同类软件快100倍。
存下图为a.zip解压便是(外链图有效期仅7天,过期不再提供任何下载)

用法:
-------------------------------------------------------------------------
trie u:[统计字典文件] <[待统计文本流]>[输出文本流]
echo hello world|trie -u:C1.DI
trie +u:C2.DI <in.txt>out.txt
trie u:S2_IDIOM.DI <in.txt>out.txt
trie m
注:u前面可以加正负号,表示正逆序排序输出统计表。
-------------------------------------------------------------------------
原创代码: | | | | | | | | | | | | | | | | | | | | | | | #include <stdio.h> | | #include <stdlib.h> | | #include <string.h> | | #include <time.h> | | | | | | #define HELP_INFORMATION "\ | | Trie v1.0 - Console txt character statistics tool\n\ | | Usage:\n\ | | trie [option] <[infile]>[outfile]\n\ | | \n\ | | General options:\n\ | | u:[userdic] Not sorting statistics result\n\ | | +u:[userdic] Sorting result order by ASC\n\ | | -u:[userdic] Sorting result order by DESC\n\ | | \n\ | | You can use switch 'm' to make a user dictionary head file\n\ | | " | | | | #define DICFILE_HEAD "\ | | /|||\n\ | | TRIE - USER DICTIONARY TABLE 1.0\n\ | | ==============================================================\n\ | | DICTIONARY NAME: EMPTY\n\ | | MADE DATE: 1970-01-01\n\ | | CHARSET: ANSI/UTF8\n\ | | MAX SHOW:\n\ | | ==============================================================\n\ | | BYTE LOW: 0\n\ | | BYTE HIGH: 255\n\ | | INSERT MIN: 1\n\ | | INSERT MAX: 8\n\ | | STEP FORWARD MIN: 1\n\ | | STEP FORWARD MAX: 2\n\ | | ==============================================================\n\ | | |||/\n\ | | " | | #ifndef byte | | typedef unsigned char byte; | | #endif | | | | #ifndef bool | | typedef enum {false, true} bool; | | #endif | | | | | | #define BYTE_EOF 0xFF | | | | #define LINE_BUFF_SIZE (1024 * 64) | | | | #define MAX_JOIN_SIZE 32 | | | | #define SHOW_ARRAY_SIZE (1024 * 10 * 64) | | | | #define TRIE_DEEP_SIZE 24 | | | | #define TRIE_VALUE_SIZE 1 | | | | | | static int trie_low_offset; | | static int trie_high_offset; | | static int trie_min_granularity; | | static int trie_max_granularity; | | static int trie_min_insert; | | static int trie_max_insert; | | static int use_english_mode; | | static int use_substring_match; | | | | | | static int trie_branch_size; | | | | static int* p_static_array; | | | | static int show_list_size; | | | | static int already_show_list; | | | | | | #define ISLETTER(x) ( ((0x41 <= (x)) && ((x) <= 0x5A)) || ((0x61 <= (x)) && ((x) <= 0x7A)) ) | | | | #define ISLOWERLETTER(x) ((0x61 <= (x)) && ((x) <= 0x7A)) | | | | #define ISUPPERLETTER(x) ((0x41 <= (x)) && ((x) <= 0x5A)) | | | | | | #define MAXVALUE(a,b) (((a)>(b))?(a):(b)) | | | | | | typedef enum | | { | | SORT_NO = 0, | | SORT_ASC = 1, | | SORT_DESC = -1 | | } SortMode; | | | | | | #define QSORT_STACK_SIZE 1024 | | | | #define SWAP(a,b) (((a)==(b))?0:((a)^=(b)^=(a)^=(b))) | | | | | | #define COMP(a,b,sortMode) (((a)-(b))*(sortMode)) | | | | | | bool QsortStaticArray(int* staticArray, SortMode sortMode, int left, int right) | | { | | | | if(right == left) | | { | | return true; | | } | | | | | | if( | | staticArray == NULL | | || left < 0 | | || right < 0 | | || left > right | | ) | | { | | return false; | | } | | | | | | int (*iArray)[2] = (int(*)[2])staticArray; | | | | | | int pStack[QSORT_STACK_SIZE][2], pStackIndex = 0; | | | | | | pStack[pStackIndex ][0] = left; | | pStack[pStackIndex ++][1] = right; | | | | | | while(pStackIndex > 0) | | { | | | | int i = pStack[-- pStackIndex][0]; | | int j = pStack[ pStackIndex][1]; | | | | if(i < j) | | { | | int ls = i, rs = j; | | | | | | if(j - i + 1 < 8) | | { | | int p, max; | | while (ls < rs) | | { | | max = ls; | | for (p = ls + 1; p <= rs; p ++) | | { | | if(COMP(iArray[p][0], iArray[max][0], sortMode) > 0) | | { | | max = p; | | } | | } | | SWAP(iArray[max][0], iArray[rs][0]); | | SWAP(iArray[max][1], iArray[rs][1]); | | rs--; | | } | | | | continue; | | } | | | | | | int mid = (j-i)/2 + i + 1; | | | | if(COMP(iArray[i][0], iArray[mid][0], sortMode) > 0) | | { | | SWAP(iArray[i][0], iArray[mid][0]); | | SWAP(iArray[i][1], iArray[mid][1]); | | } | | if(COMP(iArray[i][0], iArray[j][0], sortMode) > 0) | | { | | SWAP(iArray[i][0], iArray[j][0]); | | SWAP(iArray[i][1], iArray[j][1]); | | } | | if(COMP(iArray[mid][0], iArray[j][0], sortMode) > 0) | | { | | SWAP(iArray[mid][0], iArray[j][0]); | | SWAP(iArray[mid][1], iArray[j][1]); | | } | | SWAP(iArray[mid][0], iArray[i][0]); | | SWAP(iArray[mid][1], iArray[i][1]); | | | | | | int base[2] = {iArray[ls][0], iArray[ls][1]}; | | bool needsCheck = false; | | bool alreadyCheck = false; | | | | | | while(ls < rs) | | { | | while(ls < rs && COMP(iArray[rs][0], base[0], sortMode) >= 0) | | { | | rs--; | | } | | iArray[ls][0] = iArray[rs][0]; | | iArray[ls][1] = iArray[rs][1]; | | | | | | if(alreadyCheck == false) | | { | | if(rs == ls) | | { | | needsCheck = true; | | } | | } | | | | while(ls < rs && COMP(iArray[ls][0], base[0], sortMode) <= 0) | | { | | ls++; | | } | | iArray[rs][0] = iArray[ls][0]; | | iArray[rs][1] = iArray[ls][1]; | | | | | | if(alreadyCheck == false) | | { | | if(rs == ls) | | { | | needsCheck = true; | | } | | } | | | | alreadyCheck = true; | | } | | iArray[ls][0] = base[0]; | | iArray[ls][1] = base[1]; | | | | | | if(needsCheck == true) | | { | | int check = i; | | while(check < j && COMP(iArray[check][0], iArray[check + 1][0], sortMode) <= 0) | | { | | check ++; | | } | | if(check == j) | | { | | | | continue; | | } | | } | | | | | | int k = ls; | | if(i < k - 1) | | { | | | | pStack[pStackIndex ][0] = i; | | pStack[pStackIndex ++][1] = k-1; | | } | | | | if(k +1 < j) | | { | | | | pStack[pStackIndex ][0] = k+1; | | pStack[pStackIndex ++][1] = j; | | } | | } | | } | | return true; | | } | | | | | | typedef struct TrieNode | | { | | | | bool end; | | | | | | int value[TRIE_VALUE_SIZE]; | | | | | | struct TrieNode** branch; | | | | } TrieNode, *PTrieNode; | | | | | | PTrieNode PTrieNodeCreate(int branchSize) | | { | | | | PTrieNode pTrieNode = (PTrieNode)malloc(1 * sizeof(TrieNode)); | | | | if(pTrieNode == NULL) | | { | | | | fprintf(stderr, "Can't have enough memory\n"); | | exit(1); | | } | | | | | | pTrieNode->end = false; | | | | | | memset((void*)pTrieNode->value, (int)NULL, TRIE_VALUE_SIZE * sizeof(int)); | | | | | | pTrieNode->branch = (TrieNode**)calloc(branchSize, sizeof(TrieNode*)); | | | | if(pTrieNode->branch == NULL) | | { | | | | fprintf(stderr, "Can't have enough memory\n"); | | exit(1); | | } | | | | | | return pTrieNode; | | } | | | | | | void TrieFree(PTrieNode rootPTrieNode) | | { | | PTrieNode pTrie = rootPTrieNode; | | | | int i; | | for(i = 0; i < trie_branch_size; i ++) | | { | | if(pTrie->branch[i] != NULL) | | { | | TrieFree(pTrie->branch[i]); | | } | | } | | free(pTrie); | | } | | | | | | int TrieInsert(PTrieNode rootPTrieNode, byte* insertData, size_t insertDataLen) | | { | | | | if(insertDataLen >= TRIE_DEEP_SIZE) | | { | | | | fprintf(stderr, "The dictionary tree is inserted too deep\n"); | | exit(1); | | } | | | | PTrieNode pTrie = rootPTrieNode; | | byte* p = insertData; | | | | while(p - insertData < insertDataLen) | | { | | if(pTrie->branch[*p - trie_low_offset] == NULL) | | { | | | | pTrie->branch[*p - trie_low_offset] = PTrieNodeCreate(trie_branch_size); | | } | | | | | | pTrie = pTrie->branch[*p - trie_low_offset]; | | | | | | if(pTrie->end == false) | | { | | if((p + 1) - insertData == insertDataLen) | | { | | pTrie->end = true; | | break; | | } | | } | | | | | | p ++; | | } | | | | return 0; | | } | | | | | | typedef struct RetTrieStatic | | { | | | | int thisMatchedCount; | | | | | | int restBytesNumber; | | | | | | byte* restMatchPtr; | | } RetTrieStatic, *PRetTrieStatic; | | | | | | int TrieStatic(PTrieNode rootPTrieNode, byte* originalDataPtr, size_t originalDataLen, bool useSubstringMatch, bool useEnglishMode, bool isJoinMatch, PRetTrieStatic pRetTrieStatic, size_t minGranularity, size_t maxGranularity) | | { | | | | PTrieNode p_trie_stack[TRIE_DEEP_SIZE + 1]; | | | | | | int mathedCount = 0; | | | | int restDataLen = 0; | | | | | | byte* searchDataPtr = originalDataPtr; | | | | | | while(*searchDataPtr != BYTE_EOF) | | { | | | | while(*searchDataPtr != BYTE_EOF) | | { | | if( | | | | (! ((trie_low_offset <= *searchDataPtr) && (*searchDataPtr <= trie_high_offset))) | | | | | | || ((useEnglishMode) && (! ISLETTER(*searchDataPtr))) | | ) | | { | | searchDataPtr ++; | | } | | else | | { | | | | break; | | } | | } | | | | | | PTrieNode pTrie = rootPTrieNode; | | byte* p = searchDataPtr; | | | | | | int p_trie_index = 0; | | | | | | int matchLen = 0; | | bool matchAlready = false; | | | | | | while(*p != BYTE_EOF) | | { | | if( | | | | (!( | | | | (*p >= trie_low_offset) | | | | | | && (*p <= trie_high_offset) | | )) | | | | | | || (pTrie->branch[*p - trie_low_offset] == NULL) | | ) | | { | | | | if( | | (useEnglishMode) | | && (pTrie->branch[*p - trie_low_offset] == NULL) | | && (ISLETTER(*p)) | | ) | | { | | *p = ISUPPERLETTER(*p) ?(*p + 0x20) :(*p - 0x20); | | int V = (int)(*p) - trie_low_offset; | | | | | | if(! ((0 <= V) && (V < trie_branch_size) && (pTrie->branch[V] != NULL))) | | { | | break; | | } | | } | | else | | { | | break; | | } | | } | | | | | | pTrie = pTrie->branch[*p - trie_low_offset]; | | | | | | p_trie_stack[++ p_trie_index] = pTrie; | | | | | | if(pTrie->end == true) | | { | | | | if(useEnglishMode) | | { | | if(! ISLETTER(*(p+1))) | | { | | | | matchAlready = true; | | | | | | matchLen = p_trie_index; | | pTrie->value[0] ++; | | break; | | } | | } | | else | | { | | pTrie->value[0] ++; | | } | | } | | | | | | p ++; | | } | | | | | | while((p_trie_index > 0) && (! useEnglishMode)) | | { | | | | if((p_trie_stack[p_trie_index])->end == true) | | { | | | | if( | | | | (matchAlready) | | | | | | && ((p_trie_stack[p_trie_index]->value)[0] > 0) | | ) | | { | | | | (p_trie_stack[p_trie_index]->value)[0] --; | | } | | | | | | if(! matchAlready) | | { | | | | matchAlready = true; | | | | | | matchLen = p_trie_index; | | | | | | if(! useSubstringMatch) | | { | | break; | | } | | } | | } | | p_trie_index --; | | } | | | | | | if(matchAlready) | | { | | | | mathedCount ++; | | | | searchDataPtr += matchLen; | | } | | else | | { | | if(! useEnglishMode) | | { | | | | searchDataPtr += ((*p) < 0x80) ?minGranularity :maxGranularity; | | } | | else | | { | | | | while((*searchDataPtr != BYTE_EOF) && (ISLETTER(*searchDataPtr))) | | { | | searchDataPtr ++; | | } | | } | | } | | | | | | if(isJoinMatch) | | { | | | | if(*searchDataPtr != BYTE_EOF) | | { | | restDataLen = originalDataLen - (searchDataPtr - originalDataPtr); | | } | | else | | { | | restDataLen = 0; | | } | | | | | | if(restDataLen + 1 < MAX_JOIN_SIZE) | | { | | | | break; | | } | | } | | } | | | | | | pRetTrieStatic->thisMatchedCount = (int)mathedCount; | | pRetTrieStatic->restBytesNumber = (int)restDataLen; | | pRetTrieStatic->restMatchPtr = (searchDataPtr == originalDataPtr) ?NULL :(byte*)searchDataPtr; | | return 0; | | } | | | | | | static byte branch_stack[TRIE_DEEP_SIZE + 1] = {}; | | static int branch_stack_index = -1; | | | | | | int TriePrint(PTrieNode rootPTrieNode, const int* inArray, int totalMatchedNum) | | { | | | | branch_stack_index ++; | | | | | | PTrieNode pTrie = rootPTrieNode; | | | | int i; | | for(i = 0; i < trie_branch_size; i ++) | | { | | if(pTrie->branch[i] != NULL) | | { | | branch_stack[branch_stack_index] = (char)(i + (int)trie_low_offset); | | | | if((pTrie->branch[i])->end == true) | | { | | if((pTrie->branch[i])->value[0] > 0) | | { | | | | branch_stack[branch_stack_index + 1] = 0x00; | | | | | | if(inArray == NULL) | | { | | | | fprintf(stdout, "%8d %7.4f %s\n", (pTrie->branch[i])->value[0], (((pTrie->branch[i])->value[0] * 1.0 / totalMatchedNum) * 100.0), (char*)branch_stack); | | | | | | if( | | (show_list_size > 0) | | | | | | && ((++ already_show_list) >= show_list_size) | | ) | | { | | exit(0); | | } | | } | | | | else | | { | | | | byte* pShowStr = (byte*)malloc((branch_stack_index + 2) * sizeof(byte)); | | if(pShowStr == NULL) | | { | | | | fprintf(stderr, "Can't have enough memory\n"); | | exit(1); | | } | | | | | | memcpy((void*)pShowStr, (const void*)branch_stack, (size_t)(branch_stack_index + 2)); | | | | | | *(p_static_array ++) = (int)((pTrie->branch[i])->value[0]); | | *(p_static_array ++) = (int)pShowStr; | | | | | | if(((int)(p_static_array - inArray))/2 + 8 >= SHOW_ARRAY_SIZE) | | { | | | | fprintf(stderr, "Array out of bounds\n"); | | exit(1); | | } | | } | | } | | } | | | | | | TriePrint(pTrie->branch[i], inArray, totalMatchedNum); | | } | | } | | | | | | branch_stack_index --; | | return 0; | | } | | | | | | static const byte* dictionary_annotation_key[][2] = | | { | | {(byte*)((int*)&show_list_size), (byte*)"MAX SHOW:"}, | | {(byte*)((int*)&trie_low_offset), (byte*)"BYTE LOW:"}, | | {(byte*)((int*)&trie_high_offset), (byte*)"BYTE HIGH:"}, | | {(byte*)((int*)&trie_min_insert), (byte*)"INSERT MIN:"}, | | {(byte*)((int*)&trie_max_insert), (byte*)"INSERT MAX:"}, | | {(byte*)((int*)&trie_min_granularity), (byte*)"STEP FORWARD MIN:"}, | | {(byte*)((int*)&trie_max_granularity), (byte*)"STEP FORWARD MAX:"}, | | {(byte*)((int*)&use_english_mode), (byte*)"USE ENGLISH MODE:"}, | | {(byte*)((int*)&use_substring_match), (byte*)"USE SUBMATCH MODE:"}, | | {NULL, (byte*)"|||/"}, | | {NULL, (byte*)NULL} | | }; | | | | | | bool AlreadyGetStartupParameters(byte* pLineHead, byte* pLineEnd) | | { | | | | if(pLineHead == NULL || pLineEnd == NULL || pLineHead == pLineEnd) | | { | | return false; | | } | | | | | | byte tmp = *pLineEnd; | | | | *pLineEnd = 0x00; | | | | | | byte*** ptr = (byte***)dictionary_annotation_key; | | | | | | while((*(ptr + 1)) != NULL) | | { | | | | byte *lp = (byte*)pLineHead, *rp = (byte*)(*(ptr + 1)); | | while((*lp != 0x00) && (*rp != 0x00)) | | { | | if( | | (ISUPPERLETTER(*lp) ?(*lp + 0x20) :*lp) != (ISUPPERLETTER(*rp) ?(*rp + 0x20) :*rp) | | ) | | { | | break; | | } | | lp ++, rp ++; | | } | | | | | | if(*rp == 0x00) | | { | | | | if((*(ptr + 0)) == NULL) | | { | | | | *pLineEnd = tmp; | | | | | | return true; | | } | | | | | | int thisParameters = atoi((char*)((byte*)pLineHead + ((byte*)rp - (byte*)(*(ptr + 1))))); | | | | | | *((int*)(*(ptr + 0))) = (int)thisParameters; | | | | | | break; | | } | | | | | | ptr += sizeof(*dictionary_annotation_key) / sizeof(**dictionary_annotation_key); | | } | | | | | | *pLineEnd = tmp; | | | | | | return false; | | } | | | | | | int TrieFillDictionary(PTrieNode rootPTrieNode, char* inFile) | | { | | | | trie_low_offset = 0x00; | | trie_high_offset = 0xFF; | | trie_branch_size = trie_high_offset - trie_low_offset + 1; | | | | | | trie_min_insert = 1; | | trie_max_insert = TRIE_DEEP_SIZE - 1; | | | | | | trie_min_granularity = 1; | | trie_max_granularity = 1; | | | | | | use_english_mode = false; | | use_substring_match = false; | | | | | | FILE* fp = fopen(inFile, "rb"); | | if(fp == NULL) | | { | | fprintf(stderr, "Can't read the dictionary \"%s\"\n", inFile); | | exit(1); | | } | | | | fseek(fp, 0, SEEK_SET); | | | | fseek(fp, 0, SEEK_END); | | | | int inFileLen = ftell(fp); | | | | fseek(fp, 0, SEEK_SET); | | | | | | const byte* inFileContainer = (byte*)malloc((inFileLen + 3) * sizeof(byte)); | | if(inFileContainer == NULL) | | { | | | | fprintf(stderr, "Can't have enough memory\n"); | | exit(1); | | } | | | | fread((void*)inFileContainer, (size_t)inFileLen, sizeof(byte), fp); | | | | fclose(fp); | | | | | | byte* pLine = (byte*)inFileContainer; | | | | pLine[inFileLen + 0] = 0x0D; | | pLine[inFileLen + 1] = 0x0A; | | pLine[inFileLen + 2] = BYTE_EOF; | | | | | | byte* p = pLine; | | | | | | int lineLen = 0; | | | | | | bool alreadyGetParameters = false; | | | | | | int insertWordsCount = 0; | | | | | | int getStartupParametersDeep = 0; | | | | | | while(*p != BYTE_EOF) | | { | | | | while((*p != BYTE_EOF) && (*p != 0x0D) && (*p != 0x0A)) | | { | | | | if( | | | | (alreadyGetParameters == true) | | && | | (!( | | | | (*p >= trie_low_offset) | | | | | | && (*p <= trie_high_offset) | | )) | | ) | | { | | while((*p != BYTE_EOF) && (*p != 0x0D) && (*p != 0x0A)) | | { | | p ++; | | } | | while((*p != BYTE_EOF) && ((*p == 0x0D) || (*p == 0x0A))) | | { | | p ++; | | } | | pLine = p; | | continue; | | } | | | | p ++; | | } | | | | | | lineLen = p - pLine; | | | | | | if( | | | | (alreadyGetParameters == true) | | | | | | && (trie_min_insert <= lineLen) | | && (lineLen <= trie_max_insert) | | | | | | && ((lineLen + 1) < TRIE_DEEP_SIZE) | | ) | | { | | | | TrieInsert(rootPTrieNode, (byte*)pLine, lineLen); | | insertWordsCount ++; | | } | | | | | | if(alreadyGetParameters == false) | | { | | if((++ getStartupParametersDeep) > 512) | | { | | | | fprintf(stderr, "Can't get startup parameters\n"); | | exit(1); | | } | | | | | | if(AlreadyGetStartupParameters(pLine, p) == true) | | { | | | | if( | | (! ( (0 <= trie_low_offset) && (trie_low_offset <= 255) ) ) | | || (! ( (0 <= trie_high_offset) && (trie_high_offset <= 255) ) ) | | || (trie_low_offset > trie_high_offset) | | || (trie_min_insert > trie_max_insert) | | || (trie_max_insert > TRIE_DEEP_SIZE) | | || (trie_min_granularity <= 0) | | || (trie_max_granularity <= 0) | | ) | | { | | | | fprintf(stderr, "Wrong startup trie database parameters\n"); | | exit(1); | | } | | | | | | trie_branch_size = (trie_high_offset - trie_low_offset) + 1; | | | | | | alreadyGetParameters = true; | | } | | } | | | | | | while((*p != BYTE_EOF) && ((*p == 0x0D) || (*p == 0x0A))) | | { | | p ++; | | } | | | | | | pLine = p; | | } | | | | | | free((byte*)inFileContainer); | | | | | | if(insertWordsCount == 0) | | { | | | | fprintf(stderr, "Do not insert any words\n"); | | exit(1); | | } | | | | return 0; | | } | | | | | | int TrieStaticDictionary(PTrieNode rootPTrieNode, FILE* fp) | | { | | if(fp == NULL) | | { | | | | fprintf(stderr, "Read stdin error\n"); | | exit(1); | | } | | | | | | fseek(fp, 0, SEEK_SET); | | | | | | RetTrieStatic retTrieStatic = {}, *pRetTrieStatic = &retTrieStatic; | | | | | | byte line[MAX_JOIN_SIZE + LINE_BUFF_SIZE + 3]; | | | | | | int offset = 0; | | | | | | int totalMatched = 0; | | | | | | bool isJoinMatchMode = false; | | | | | | while(! feof(fp)) | | { | | | | size_t readLen = fread((void*)(line + MAX_JOIN_SIZE), sizeof(byte), LINE_BUFF_SIZE, fp); | | | | | | *(line + MAX_JOIN_SIZE + readLen) = BYTE_EOF; | | | | | | if(readLen < LINE_BUFF_SIZE) | | { | | isJoinMatchMode = false; | | } | | | | | | TrieStatic(rootPTrieNode, (byte*)(line + MAX_JOIN_SIZE - offset), (size_t)(readLen + offset), use_substring_match, use_english_mode, isJoinMatchMode, pRetTrieStatic, trie_min_granularity, trie_max_granularity); | | | | | | offset = pRetTrieStatic->restBytesNumber; | | | | | | if(pRetTrieStatic->restMatchPtr != NULL) | | { | | | | memcpy((void*)((line + MAX_JOIN_SIZE) - offset), (const void*)pRetTrieStatic->restMatchPtr, (size_t)pRetTrieStatic->restBytesNumber); | | | | isJoinMatchMode = true; | | } | | else | | { | | | | isJoinMatchMode = false; | | } | | | | | | totalMatched += pRetTrieStatic->thisMatchedCount; | | } | | | | | | return totalMatched; | | } | | | | | | int PrintStaticHead() | | { | | | | time_t startTime; | | struct tm* timeinfo; | | | | | | time(&startTime); | | timeinfo=localtime(&startTime); | | fprintf | | ( | | stdout | | , | | "%s" | | "------------------------------------------\n" | | " Times Percent Words\n" | | "------------------------------------------\n" | | , | | asctime(timeinfo) | | ); | | return 0; | | } | | | | | | int main(int argc, char** argv) | | { | | | | if(argc != 2) | | { | | | | fputs(HELP_INFORMATION, stderr); | | exit(1); | | } | | | | | | if(strcasecmp(argv[1], "m") == 0) | | { | | | | fputs(DICFILE_HEAD, stdout); | | exit(0); | | } | | | | | | char* ptr = strchr(argv[1], ':'); | | if(ptr == NULL) | | { | | | | fputs(HELP_INFORMATION, stderr); | | exit(1); | | } | | | | | | *(ptr ++) = '\0'; | | | | | | SortMode sortMode = SORT_NO; | | | | | | if(strcasecmp(argv[1], "+u") == 0) | | { | | sortMode = SORT_ASC; | | } | | else if(strcasecmp(argv[1], "-u") == 0) | | { | | sortMode = SORT_DESC; | | } | | else if(strcasecmp(argv[1], "u") == 0) | | { | | sortMode = SORT_NO; | | } | | else | | { | | | | fputs(HELP_INFORMATION, stderr); | | exit(1); | | } | | | | | | PTrieNode rootPTrieNode = PTrieNodeCreate((int)(0xFF - 0x00 + 1)); | | | | | | TrieFillDictionary(rootPTrieNode, ptr); | | | | | | int totalMatchedNum = TrieStaticDictionary(rootPTrieNode, stdin); | | | | | | if(sortMode == SORT_NO) | | { | | | | PrintStaticHead(); | | | | | | TriePrint(rootPTrieNode, NULL, totalMatchedNum); | | | | | | TrieFree(rootPTrieNode); | | return 0; | | } | | else | | { | | | | const int* staticArray = (int*)malloc(SHOW_ARRAY_SIZE * 2 * sizeof(int)); | | | | p_static_array = (int*)staticArray; | | | | | | TriePrint(rootPTrieNode, staticArray, totalMatchedNum); | | | | | | TrieFree(rootPTrieNode); | | | | | | int staticKeyCount = (p_static_array - staticArray) / 2; | | if(staticKeyCount == 0) | | { | | | | fprintf(stderr, "Do not match any keywords\n"); | | exit(1); | | } | | | | | | if(totalMatchedNum == staticKeyCount) | | { | | | | } | | else | | { | | | | if(QsortStaticArray((int*)staticArray, sortMode, 0, (staticKeyCount - 1)) == false) | | { | | | | fprintf(stderr, "Qsort the result failed\n"); | | exit(1); | | } | | } | | | | | | PrintStaticHead(); | | | | | | int* pStaticArray = (int*)staticArray; | | | | | | while(pStaticArray < p_static_array) | | { | | | | int staticStrTimes = (int)(*(pStaticArray ++)); | | char* staticStr = (char*)(*(pStaticArray ++)); | | | | | | fprintf(stdout, "%8d %7.4f %s\n", staticStrTimes, ((staticStrTimes * 1.0 / totalMatchedNum) * 100.0), staticStr); | | | | | | if( | | (show_list_size > 0) | | | | | | && ((++ already_show_list) >= show_list_size) | | ) | | { | | exit(1); | | } | | | | | | if(staticStr != NULL) | | { | | free(staticStr); | | } | | } | | | | free((int*)staticArray); | | } | | | | return 0; | | }COPY |
.
.
最后再附上一个自己写的成语接龙机器人,名字叫小娜(可在安卓上用C4直接编译,为节约空间,成语表 需要自己补全方可使用。) | | | | | | | | | | | | | | | #include <stdio.h> | | #include <stdlib.h> | | #include <string.h> | | #include <stdbool.h> | | #include <time.h> | | | | | | | | | | | | #if defined(WIN32) || defined(_WIN32) || defined(__MINGW32__) || defined(_MSC_VER) | | #include <windows.h> | | #define IDSO_SLEEP(x) Sleep(x) | | #else | | #include <unistd.h> | | #define IDSO_SLEEP(x) usleep(x*1000) | | #endif | | | | | | #define IDSO_SHELL_DIFFICULTY 16 | | | | | | #define LINE_BUFF_SIZE 1024 | | | | | | #define INPUT_BUFF_SIZE 36 | | | | | | #define BUCKET_DEEP_SIZE 4 | | | | | | static char* idso_data[]; | | | | static char* idso_data[] = | | { | | NULL, | | "一丁不识", | | "一不扭众", | | "一世之雄", | | "一世龙门", | | "一丘一壑", | | "一丘之貉", | | "一丝一毫", | | "一丝不挂", | | "一丝不紊", | | "一丝不苟", | | "一丝两气", | | "一丝半粟", | | "一个半个", | | "一串骊珠", | | "一举一动", | | "一举万里", | | "一举三反", | | "一举两全", | | "一举两得", | | "一举千里", | | "一举成名", | | "一之为甚", | | "一之已甚", | | "一之谓甚", | | "一乱涂地", | | "一了百了", | | "一了百当", | | "一事不知", | | "一事无成", | | "一五一十", | | "一些半些", | | "一人之交", | | }; | | | | | | static const int idso_data_size = sizeof(idso_data) / sizeof(idso_data[0]); | | | | | | typedef enum | | { | | FIRST_UC = 1, | | LAST_UC = -1, | | WHOLE_UC = 0 | | } BdkrMode; | | | | | | typedef enum | | { | | SHELL_USER_EXIT = 0, | | SHELL_USER_SUCCEED = 1, | | SHELL_USER_FAILED = 2, | | SHELL_USER_RAND = 3 | | } RetIdso; | | | | | | int GetUcLen(unsigned char inUc) | | { | | if((inUc & 0x80) == 0x00) | | { | | return 1; | | } | | | | int ucLen = 0; | | while((inUc & 0x80) == 0x80) | | { | | ucLen ++; | | inUc <<= 1; | | } | | return ucLen; | | } | | | | | | #define UQSORT_STACK_SIZE 1024 | | #define UCS2UINT(x) (unsigned int)(((unsigned char)(x)[0]<<24)|((unsigned char)(x)[1]<<16)|((unsigned char)(x)[2]<<8)|((unsigned char)(x)[3])) | | #define SWAP(a,b) do{char* __tmpNumber=(a); (a)=(b), (b)=__tmpNumber;}while(0) | | | | | | int UcsQsort(char* ucStr[], int left, int right) | | { | | if( | | ucStr == NULL | | || left < 0 | | || right < 0 | | || left >= right | | ) | | { | | return 1; | | } | | | | | | int pStack[UQSORT_STACK_SIZE][2], pStackIndex = 0; | | | | | | pStack[pStackIndex ][0] = left; | | pStack[pStackIndex ++][1] = right; | | | | int i, j; | | while(pStackIndex > 0) | | { | | | | i = pStack[-- pStackIndex][0]; | | j = pStack[ pStackIndex][1]; | | | | if(i < j) | | { | | int ls = i, rs = j; | | char* baseNum = NULL; | | | | | | if(j-i+1 < 8) | | { | | int p, max; | | while (ls < rs) | | { | | max = ls; | | for (p = ls+1; p <= rs; p ++) | | { | | if (UCS2UINT(ucStr[p]) > UCS2UINT(ucStr[max])) | | { | | max = p; | | } | | } | | SWAP(ucStr[max], ucStr[rs]); | | rs--; | | } | | | | continue; | | } | | | | | | int mid = (j-i)/2 + i + 1; | | | | | | if(UCS2UINT(ucStr[i]) > UCS2UINT(ucStr[mid])) | | { | | SWAP(ucStr[i], ucStr[mid]); | | } | | if(UCS2UINT(ucStr[i]) > UCS2UINT(ucStr[j])) | | { | | SWAP(ucStr[i], ucStr[j]); | | } | | if(UCS2UINT(ucStr[mid]) > UCS2UINT(ucStr[j])) | | { | | SWAP(ucStr[mid], ucStr[j]); | | } | | SWAP(ucStr[mid], ucStr[i]); | | | | | | ls = i, rs = j, baseNum = ucStr[ls]; | | while(ls<rs) | | { | | while(ls < rs && UCS2UINT(ucStr[rs]) >= UCS2UINT(baseNum)) | | { | | rs--; | | } | | ucStr[ls] = ucStr[rs]; | | | | while(ls < rs && UCS2UINT(ucStr[ls]) <= UCS2UINT(baseNum)) | | { | | ls++; | | } | | ucStr[rs] = ucStr[ls]; | | } | | ucStr[ls] = baseNum; | | | | | | int k = ls; | | if(i < k) | | { | | | | pStack[pStackIndex ][0] = i; | | pStack[pStackIndex ++][1] = k-1; | | } | | | | if(k < j) | | { | | | | pStack[pStackIndex ][0] = k+1; | | pStack[pStackIndex ++][1] = j; | | } | | } | | } | | return 0; | | } | | | | | | int BKDRHash(char *inStr, BdkrMode inMode, int bkdrBase) | | { | | | | if(inStr == NULL) | | { | | return 0; | | } | | | | int inStrLen = -1; | | if(inMode == FIRST_UC) | | { | | inStrLen = GetUcLen((unsigned char)*inStr); | | } | | else if(inMode == LAST_UC) | | { | | inStrLen = 1; | | inStr += (int)(strlen(inStr) - 1); | | while(((*inStr) & 0xC0) == 0x80) | | { | | inStrLen ++; | | inStr --; | | } | | } | | | | unsigned int seed = 131; | | unsigned int hash = 0; | | | | char* str = inStr; | | while ((str - inStr < inStrLen) || ((inMode == WHOLE_UC) && (*str))) | | { | | hash = hash * seed + (*(str ++)); | | } | | return (int)(hash % bkdrBase); | | } | | | | | | void* GetFirstUCharacterHashTable(char** idsoData, int idsoDataSize, int hashBase) | | { | | | | if(hashBase < idsoDataSize) | | { | | | | } | | | | | | int (*firstUcHashTable)[2] = (int(*)[2])calloc(hashBase * 2, sizeof(int)); | | | | | | int i; | | for(i = 1; i < idsoDataSize; i++) | | { | | | | int firstUcHashIndex = BKDRHash((char*)idsoData[i], FIRST_UC, hashBase); | | | | | | if(firstUcHashTable[firstUcHashIndex][0] == 0) | | { | | firstUcHashTable[firstUcHashIndex][0] = i; | | firstUcHashTable[firstUcHashIndex][1] ++; | | continue; | | } | | | | | | int ucLen = GetUcLen((unsigned char)(*(idsoData[i]))); | | | | | | if(strncmp((char*)idsoData[i], (char*)idsoData[firstUcHashTable[firstUcHashIndex][0]], ucLen) == 0) | | { | | | | firstUcHashTable[firstUcHashIndex][1] ++; | | } | | else | | { | | | | free(firstUcHashTable); | | | | | | return NULL; | | } | | } | | | | | | return (void*)firstUcHashTable; | | } | | | | | | void* GetWholeUCharacterHashTable(char** idsoData, int idsoDataSize, int hashBase) | | { | | | | if(hashBase < idsoDataSize) | | { | | | | } | | | | | | int bucketNumber = (hashBase / BUCKET_DEEP_SIZE) * (64 / BUCKET_DEEP_SIZE); | | | | | | int (*wholeUcHashTable)[BUCKET_DEEP_SIZE] = (int(*)[BUCKET_DEEP_SIZE])calloc(bucketNumber * BUCKET_DEEP_SIZE, sizeof(int)); | | | | | | int i; | | for(i = 1; i< idsoDataSize; i++) | | { | | | | int wholeUcHashIndex = BKDRHash((char*)idsoData[i], WHOLE_UC, bucketNumber); | | | | | | int j; | | for(j = 0; ; j++) | | { | | | | if(j >= BUCKET_DEEP_SIZE) | | { | | | | free(wholeUcHashTable); | | return NULL; | | } | | | | | | if(wholeUcHashTable[wholeUcHashIndex][j] == 0) | | { | | wholeUcHashTable[wholeUcHashIndex][j] = i; | | break; | | } | | } | | } | | | | | | return (void*)wholeUcHashTable; | | } | | | | | | int GetHelpInformation() | | { | | fprintf | | ( | | stdout | | , | | "[欢迎你使用 \"成语接龙\"] --- 输入Q退出\n" | | "[小娜]: 我是萌萌的小娜!\n" | | ); | | return 0; | | } | | | | | | RetIdso IdsoShellCore(int firstUcHashBase, int bucketNumber, int (*firstUcHashTable)[2], int (*wholeUcHashTable)[BUCKET_DEEP_SIZE]) | | { | | | | char* needUcStr = (char*)idso_data[rand()% (idso_data_size-1) + 1]; | | | | | | int needUcStrLastUcHash = BKDRHash(needUcStr, LAST_UC, firstUcHashBase); | | | | | | if((firstUcHashTable[needUcStrLastUcHash][0] == 0) || (firstUcHashTable[needUcStrLastUcHash][1] <= IDSO_SHELL_DIFFICULTY)) | | { | | return SHELL_USER_RAND; | | } | | | | | | char inPut[INPUT_BUFF_SIZE + 1]; | | | | | | char* sArgv[3]; | | int sArgc; | | | | | | while(true) | | { | | | | fprintf | | ( | | stdout | | , | | "[小娜]: %s\n" | | "[>>>>]: " | | , | | needUcStr | | ); | | | | | | fgets(inPut, INPUT_BUFF_SIZE, stdin); | | | | | | char* p =strchr(inPut, '\n'); | | | | | | if(p == inPut) | | { | | GetHelpInformation(); | | continue; | | } | | | | | | if(p != NULL) | | { | | *p='\0'; | | } | | | | | | if(! strcasecmp(inPut, "Q")) | | { | | fprintf(stdout, "[感谢使用,再见!]"); | | return SHELL_USER_EXIT; | | } | | | | | | sArgc = 0; | | p = inPut; | | while(*p) | | { | | while(*p == ' '|| *p == '\t') | | { | | *(p ++) = 0; | | } | | | | if(*p) | | { | | if(++ sArgc > 3) | | { | | break; | | } | | sArgv[sArgc - 1] = p; | | } | | | | while(*p && (*p != ' ') && (*p != '\t')) | | { | | p += GetUcLen((unsigned char)*p); | | } | | } | | | | | | if(sArgc == 1) | | { | | bool isCorrect = false; | | | | | | int inPutWholeUcHash = BKDRHash(sArgv[0], WHOLE_UC, bucketNumber); | | | | | | int j; | | for(j=0; (j<BUCKET_DEEP_SIZE) && (wholeUcHashTable[inPutWholeUcHash][j] != 0); j++) | | { | | char* pUcStr = (char*)idso_data[wholeUcHashTable[inPutWholeUcHash][j]]; | | | | | | if(! strcmp(sArgv[0], (char*)pUcStr)) | | { | | | | int inPutFirstUcHash = BKDRHash((char*)pUcStr, FIRST_UC, firstUcHashBase); | | | | | | if(inPutFirstUcHash != needUcStrLastUcHash) | | { | | | | break; | | } | | | | | | int inPutLastUcHash = BKDRHash((char*)pUcStr, LAST_UC, firstUcHashBase); | | | | | | if(firstUcHashTable[inPutLastUcHash][0] != 0) | | { | | | | int randIndex = rand() % firstUcHashTable[inPutLastUcHash][1]; | | char* rUcStr = (char*)idso_data[firstUcHashTable[inPutLastUcHash][0] + randIndex]; | | | | | | int rUcStrLastUcHash = BKDRHash(rUcStr, LAST_UC, firstUcHashBase); | | | | | | if(firstUcHashTable[rUcStrLastUcHash][0] == 0) | | { | | | | fprintf(stdout, "[绝杀]: %s\n", rUcStr); | | | | fprintf(stdout, "[你输了]\n"); | | return SHELL_USER_FAILED; | | } | | | | | | needUcStr = rUcStr; | | | | | | needUcStrLastUcHash = rUcStrLastUcHash; | | } | | else | | { | | | | return SHELL_USER_SUCCEED; | | } | | | | | | isCorrect = true; | | break; | | } | | } | | | | | | if(! isCorrect) | | { | | | | int pickUpCount = firstUcHashTable[needUcStrLastUcHash][1]; | | | | | | if(pickUpCount <= IDSO_SHELL_DIFFICULTY) | | { | | | | char* promptUcStr = (char*)idso_data[firstUcHashTable[needUcStrLastUcHash][0] + rand() % pickUpCount]; | | fprintf(stdout, "[小娜悄悄提示 \"%s\"]\n", promptUcStr); | | } | | else | | { | | fprintf(stdout, "[小娜希望你能认真考虑!]\n"); | | } | | } | | } | | else | | { | | | | GetHelpInformation(); | | } | | } | | } | | | | | | int main(int argc, char *argv[]) | | { | | | | UcsQsort((char**)idso_data, 1, idso_data_size-1); | | | | int firstUcHashBase = 25511, wholeUcHashBase = 18311; | | void *pFc = NULL, *pHc = NULL; | | | | | | while((pFc = GetFirstUCharacterHashTable((char**)idso_data, idso_data_size, (++ firstUcHashBase))) == NULL); | | int (*firstUcHashTable)[2] = (int(*)[2])pFc; | | | | | | while((pHc = GetWholeUCharacterHashTable((char**)idso_data, idso_data_size, (++ wholeUcHashBase))) == NULL); | | int (*wholeUcHashTable)[BUCKET_DEEP_SIZE] = (int(*)[BUCKET_DEEP_SIZE])pHc; | | | | | | | | int bucketNumber = (wholeUcHashBase / BUCKET_DEEP_SIZE) * (64 / BUCKET_DEEP_SIZE); | | | | | | srand((unsigned)time(NULL)); | | | | | | GetHelpInformation(); | | | | | | while(true) | | { | | | | RetIdso retIdso = IdsoShellCore(firstUcHashBase, bucketNumber, firstUcHashTable, wholeUcHashTable); | | | | | | if(retIdso == SHELL_USER_EXIT) | | { | | | | break; | | } | | else if(retIdso == SHELL_USER_SUCCEED) | | { | | | | fprintf | | ( | | stdout | | , | | "[欢迎你使用 \"成语接龙\"] --- 输入Q退出\n" | | "[小娜]: 我是萌萌的小娜!\n" | | ); | | } | | else if(retIdso == SHELL_USER_FAILED) | | { | | | | fprintf | | ( | | stdout | | , | | "[欢迎你使用 \"成语接龙\"] --- 输入Q退出\n" | | "[小娜]: 我是萌萌的小娜!\n" | | ); | | } | | else if(retIdso == SHELL_USER_RAND) | | { | | | | IDSO_SLEEP(1); | | } | | } | | | | | | free(firstUcHashTable); | | free(wholeUcHashTable); | | return 0; | | }COPY |
|