批处理之家 - Powered by Discuz! Board

/*
CONSOLE TXT CHARACTER STATISTICS TOOL @COPYRIGHT@2017~2019 BY HAPPY, VERSION 1.0
WED OCT 25 2017 21:10:16 GMT+0800
**********************************************************************
gcc trie.c -o trie.exe REM For Windows
gcc trie.c -o trie REM For Linux
gcc trie.c -o trie REM For Android
cl trie.cpp /MD /Ox /out:trie.exe REM For VS
**********************************************************************
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// 定义帮助说明
#define HELP_INFORMATION "\
Trie v1.0 - Console txt character statistics tool\n\
Usage:\n\
trie [option] <[infile]>[outfile]\n\
\n\
General options:\n\
u:[userdic] Not sorting statistics result\n\
+u:[userdic] Sorting result order by ASC\n\
-u:[userdic] Sorting result order by DESC\n\
\n\
You can use switch 'm' to make a user dictionary head file\n\
"
// 定义字典注释头
#define DICFILE_HEAD "\
/|||\n\
TRIE - USER DICTIONARY TABLE 1.0\n\
==============================================================\n\
DICTIONARY NAME: EMPTY\n\
MADE DATE: 1970-01-01\n\
CHARSET: ANSI/UTF8\n\
MAX SHOW:\n\
==============================================================\n\
BYTE LOW: 0\n\
BYTE HIGH: 255\n\
INSERT MIN: 1\n\
INSERT MAX: 8\n\
STEP FORWARD MIN: 1\n\
STEP FORWARD MAX: 2\n\
==============================================================\n\
|||/\n\
"
#ifndef byte
typedef unsigned char byte;
#endif
#ifndef bool
typedef enum {false, true} bool;
#endif
// 比特数组结尾
#define BYTE_EOF 0xFF
// 标准行长
#define LINE_BUFF_SIZE (1024 * 64)
// 拼接缓存区最大长度
#define MAX_JOIN_SIZE 32
// 展示数组词条容量（不超过64万条）
#define SHOW_ARRAY_SIZE (1024 * 10 * 64)
// Trie树深度，即字典词目最大字节数
#define TRIE_DEEP_SIZE 24
// Trie树价值数组长度
#define TRIE_VALUE_SIZE 1
// Trie树启动参数
static int trie_low_offset;
static int trie_high_offset;
static int trie_min_granularity;
static int trie_max_granularity;
static int trie_min_insert;
static int trie_max_insert;
static int use_english_mode;
static int use_substring_match;
// Trie树分支宽度
static int trie_branch_size;
// 统计表数组辅助指针
static int* p_static_array;
// 展示列表数目
static int show_list_size;
// 已经展示的数目
static int already_show_list;
// 判断是否字母的宏
#define ISLETTER(x) ( ((0x41 <= (x)) && ((x) <= 0x5A)) || ((0x61 <= (x)) && ((x) <= 0x7A)) )
// 判断是否小写字母的宏
#define ISLOWERLETTER(x) ((0x61 <= (x)) && ((x) <= 0x7A))
// 判断是否大写字母的宏
#define ISUPPERLETTER(x) ((0x41 <= (x)) && ((x) <= 0x5A))
// 取最值
#define MAXVALUE(a,b) (((a)>(b))?(a):(b))
// 排序类型
typedef enum
{
SORT_NO = 0,
SORT_ASC = 1,
SORT_DESC = -1
} SortMode;
// 快排堆栈尺寸
#define QSORT_STACK_SIZE 1024
// 快排数据交换
#define SWAP(a,b) (((a)==(b))?0:((a)^=(b)^=(a)^=(b)))
// #define SWAP(a,b) do{int _tmp=(a);(a)=(b),(b)=_tmp;}while(false)
// 快排比较函数
#define COMP(a,b,sortMode) (((a)-(b))*(sortMode))
// 增强型快排算法
bool QsortStaticArray(int* staticArray, SortMode sortMode, int left, int right)
{
// 针对仅排序一个元素时，无需排序，直接返回真
if(right == left)
{
return true;
}
// 检验输入参数合法性
if(
staticArray == NULL
|| left < 0
|| right < 0
|| left > right
)
{
return false;
}
// 数据类型转化
int (*iArray)[2] = (int(*)[2])staticArray;
// 定义数组堆栈，用来记录
int pStack[QSORT_STACK_SIZE][2], pStackIndex = 0;
// 快排分区首次出栈
pStack[pStackIndex ][0] = left;
pStack[pStackIndex ++][1] = right;
// 混排算法核心
while(pStackIndex > 0)
{
// 快排分区出栈
int i = pStack[-- pStackIndex][0];
int j = pStack[ pStackIndex][1];
if(i < j)
{
int ls = i, rs = j;
// 对于小数组，直接采用选择排序
if(j - i + 1 < 8)
{
int p, max;
while (ls < rs)
{
max = ls;
for (p = ls + 1; p <= rs; p ++)
{
if(COMP(iArray[p][0], iArray[max][0], sortMode) > 0)
{
max = p;
}
}
SWAP(iArray[max][0], iArray[rs][0]);
SWAP(iArray[max][1], iArray[rs][1]);
rs--;
}
continue;
}
// 计算中间项数
int mid = (j-i)/2 + i + 1;
// 整理局部数组，以使中位数居中
if(COMP(iArray[i][0], iArray[mid][0], sortMode) > 0)
{
SWAP(iArray[i][0], iArray[mid][0]);
SWAP(iArray[i][1], iArray[mid][1]);
}
if(COMP(iArray[i][0], iArray[j][0], sortMode) > 0)
{
SWAP(iArray[i][0], iArray[j][0]);
SWAP(iArray[i][1], iArray[j][1]);
}
if(COMP(iArray[mid][0], iArray[j][0], sortMode) > 0)
{
SWAP(iArray[mid][0], iArray[j][0]);
SWAP(iArray[mid][1], iArray[j][1]);
}
SWAP(iArray[mid][0], iArray[i][0]);
SWAP(iArray[mid][1], iArray[i][1]);
// 获取基准数据
int base[2] = {iArray[ls][0], iArray[ls][1]};
bool needsCheck = false;
bool alreadyCheck = false;
// 快排交换核心
while(ls < rs)
{
while(ls < rs && COMP(iArray[rs][0], base[0], sortMode) >= 0)
{
rs--;
}
iArray[ls][0] = iArray[rs][0];
iArray[ls][1] = iArray[rs][1];
// 右排序检查器
if(alreadyCheck == false)
{
if(rs == ls)
{
needsCheck = true;
}
}
while(ls < rs && COMP(iArray[ls][0], base[0], sortMode) <= 0)
{
ls++;
}
iArray[rs][0] = iArray[ls][0];
iArray[rs][1] = iArray[ls][1];
// 左排序检查器
if(alreadyCheck == false)
{
if(rs == ls)
{
needsCheck = true;
}
}
// 已经检查完毕
alreadyCheck = true;
}
iArray[ls][0] = base[0];
iArray[ls][1] = base[1];
// 如果需要检查排序
if(needsCheck == true)
{
int check = i;
while(check < j && COMP(iArray[check][0], iArray[check + 1][0], sortMode) <= 0)
{
check ++;
}
if(check == j)
{
// 已经排序，直接跳转
continue;
}
}
// 快排分区数组堆栈实现
int k = ls;
if(i < k - 1)
{
// 快排分区入栈
pStack[pStackIndex ][0] = i;
pStack[pStackIndex ++][1] = k-1;
}
if(k +1 < j)
{
// 快排分区入栈
pStack[pStackIndex ][0] = k+1;
pStack[pStackIndex ++][1] = j;
}
}
}
return true;
}
// 创建TrieNode结构体
typedef struct TrieNode
{
// 节点词尾
bool end;
// 节点数据（value[]数组用于词频统计）
int value[TRIE_VALUE_SIZE];
// 节点分支
struct TrieNode** branch;
} TrieNode, *PTrieNode;
// 创建TrieNode节点
PTrieNode PTrieNodeCreate(int branchSize)
{
// 动态分配TrieNode内存
PTrieNode pTrieNode = (PTrieNode)malloc(1 * sizeof(TrieNode));
// 分配内存识别
if(pTrieNode == NULL)
{
// 如果开辟内存失败，直接退出
fprintf(stderr, "Can't have enough memory\n");
exit(1);
}
// 词目结尾标记
pTrieNode->end = false;
// 统计信息记录点
memset((void*)pTrieNode->value, (int)NULL, TRIE_VALUE_SIZE * sizeof(int));
// 分配Trie树节点内存
pTrieNode->branch = (TrieNode**)calloc(branchSize, sizeof(TrieNode*));
// 如果开辟内存失败，直接退出
if(pTrieNode->branch == NULL)
{
// 如果开辟内存失败，直接退出
fprintf(stderr, "Can't have enough memory\n");
exit(1);
}
// 分配内存成功，则返回节点指针
return pTrieNode;
}
// 释放Trie树内存
void TrieFree(PTrieNode rootPTrieNode)
{
PTrieNode pTrie = rootPTrieNode;
int i;
for(i = 0; i < trie_branch_size; i ++)
{
if(pTrie->branch[i] != NULL)
{
TrieFree(pTrie->branch[i]);
}
}
free(pTrie);
}
// Trie树插入函数
int TrieInsert(PTrieNode rootPTrieNode, byte* insertData, size_t insertDataLen)
{
// 当插入数据过深时直接拒绝
if(insertDataLen >= TRIE_DEEP_SIZE)
{
// 抛出错误
fprintf(stderr, "The dictionary tree is inserted too deep\n");
exit(1);
}
PTrieNode pTrie = rootPTrieNode;
byte* p = insertData;
while(p - insertData < insertDataLen)
{
if(pTrie->branch[*p - trie_low_offset] == NULL)
{
// 创建分支节点
pTrie->branch[*p - trie_low_offset] = PTrieNodeCreate(trie_branch_size);
}
// 深入下层节点
pTrie = pTrie->branch[*p - trie_low_offset];
// 插入词组
if(pTrie->end == false)
{
if((p + 1) - insertData == insertDataLen)
{
pTrie->end = true;
break;
}
}
// 指针移位
p ++;
}
return 0;
}
// 创建TrieStatic返回结构体
typedef struct RetTrieStatic
{
// 单次匹配的词目数
int thisMatchedCount;
// 剩余的字节数目
int restBytesNumber;
// 剩余的匹配位置
byte* restMatchPtr;
} RetTrieStatic, *PRetTrieStatic;
// Trie树统计函数
int TrieStatic(PTrieNode rootPTrieNode, byte* originalDataPtr, size_t originalDataLen, bool useSubstringMatch, bool useEnglishMode, bool isJoinMatch, PRetTrieStatic pRetTrieStatic, size_t minGranularity, size_t maxGranularity)
{
// Trie节点堆栈
PTrieNode p_trie_stack[TRIE_DEEP_SIZE + 1];
// 本次匹配的词目数
int mathedCount = 0;
// 余下的数据长度
int restDataLen = 0;
// 设置数据指针
byte* searchDataPtr = originalDataPtr;
// 循环切片匹配
while(*searchDataPtr != BYTE_EOF)
{
// 字符过滤器
while(*searchDataPtr != BYTE_EOF)
{
if(
// 判断字节范围
(! ((trie_low_offset <= *searchDataPtr) && (*searchDataPtr <= trie_high_offset)))
// 或者是英文模式下，非字母
|| ((useEnglishMode) && (! ISLETTER(*searchDataPtr)))
)
{
searchDataPtr ++;
}
else
{
// 符合字节范围的、则中断内层循环，送入外层循环
break;
}
}
// 辅助主指针
PTrieNode pTrie = rootPTrieNode;
byte* p = searchDataPtr;
// 堆栈索引
int p_trie_index = 0;
// 声明匹配参量
int matchLen = 0;
bool matchAlready = false;
// 匹配算法核心
while(*p != BYTE_EOF)
{
if(
// 字节过滤范围
(!(
// 字节筛选器低位
(*p >= trie_low_offset)
// 字节筛选器高位
&& (*p <= trie_high_offset)
))
// 匹配失败
|| (pTrie->branch[*p - trie_low_offset] == NULL)
)
{
// 如果是英文匹配模式，则会自动忽略大小写
if(
(useEnglishMode)
&& (pTrie->branch[*p - trie_low_offset] == NULL)
&& (ISLETTER(*p))
)
{
*p = ISUPPERLETTER(*p) ?(*p + 0x20) :(*p - 0x20);
int V = (int)(*p) - trie_low_offset;
// 忽略大小写匹配英文单词
if(! ((0 <= V) && (V < trie_branch_size) && (pTrie->branch[V] != NULL)))
{
break;
}
}
else
{
break;
}
}
// 深入下层节点
pTrie = pTrie->branch[*p - trie_low_offset];
// 下层节点入栈
p_trie_stack[++ p_trie_index] = pTrie;
// 词频统计器
if(pTrie->end == true)
{
// 如果英文匹配模式
if(useEnglishMode)
{
if(! ISLETTER(*(p+1)))
{
// 标记为已匹配
matchAlready = true;
// 计算匹配的长度
matchLen = p_trie_index;
pTrie->value[0] ++;
break;
}
}
else
{
pTrie->value[0] ++;
}
}
// 指针移位
p ++;
}
// 扫尾处理
while((p_trie_index > 0) && (! useEnglishMode))
{
// 正常匹配模式
if((p_trie_stack[p_trie_index])->end == true)
{
// 词条统计消重
if(
// 并且已经匹配过
(matchAlready)
// 并且统计数目大于0
&& ((p_trie_stack[p_trie_index]->value)[0] > 0)
)
{
// 之前匹配的子串去重
(p_trie_stack[p_trie_index]->value)[0] --;
}
// 计算匹配到的词组长度
if(! matchAlready)
{
// 标记为已匹配
matchAlready = true;
// 计算匹配到的词组长度
matchLen = p_trie_index;
// 是否启用子串统计
if(! useSubstringMatch)
{
break;
}
}
}
p_trie_index --;
}
// 根据匹配长度去结算下一数据点位
if(matchAlready)
{
// 匹配词目计数器
mathedCount ++;
// 数据指针推进
searchDataPtr += matchLen;
}
else
{
if(! useEnglishMode)
{
// 数据指针粒度推进
searchDataPtr += ((*p) < 0x80) ?minGranularity :maxGranularity;
}
else
{
// 当为英文匹配模式时，先略过匹配失败的连续字母
while((*searchDataPtr != BYTE_EOF) && (ISLETTER(*searchDataPtr)))
{
searchDataPtr ++;
}
}
}
// 是否进入拼接匹配模式
if(isJoinMatch)
{
// 计算剩余数据长度
if(*searchDataPtr != BYTE_EOF)
{
restDataLen = originalDataLen - (searchDataPtr - originalDataPtr);
}
else
{
restDataLen = 0;
}
// 如果余下的数据过短
if(restDataLen + 1 < MAX_JOIN_SIZE)
{
// 直接跳出
break;
}
}
}
// 装载返回值结构体
pRetTrieStatic->thisMatchedCount = (int)mathedCount;
pRetTrieStatic->restBytesNumber = (int)restDataLen;
pRetTrieStatic->restMatchPtr = (searchDataPtr == originalDataPtr) ?NULL :(byte*)searchDataPtr;
return 0;
}
// 打印Trie统计结果
static byte branch_stack[TRIE_DEEP_SIZE + 1] = {};
static int branch_stack_index = -1;
// Trie打印
int TriePrint(PTrieNode rootPTrieNode, const int* inArray, int totalMatchedNum)
{
// 入栈
branch_stack_index ++;
// 辅助结构体指针
PTrieNode pTrie = rootPTrieNode;
int i;
for(i = 0; i < trie_branch_size; i ++)
{
if(pTrie->branch[i] != NULL)
{
branch_stack[branch_stack_index] = (char)(i + (int)trie_low_offset);
if((pTrie->branch[i])->end == true)
{
if((pTrie->branch[i])->value[0] > 0)
{
// 置结词条结束符
branch_stack[branch_stack_index + 1] = 0x00;
// 如果是非排序打印
if(inArray == NULL)
{
// 打印统计列表
fprintf(stdout, "%8d %7.4f %s\n", (pTrie->branch[i])->value[0], (((pTrie->branch[i])->value[0] * 1.0 / totalMatchedNum) * 100.0), (char*)branch_stack);
// 列表显示计数器
if(
(show_list_size > 0)
// 超过设定范围，直接退出
&& ((++ already_show_list) >= show_list_size)
)
{
exit(0);
}
}
// 如果是排序打印
else
{
// 分配展示字串内存
byte* pShowStr = (byte*)malloc((branch_stack_index + 2) * sizeof(byte));
if(pShowStr == NULL)
{
// 无法分配内存
fprintf(stderr, "Can't have enough memory\n");
exit(1);
}
// 复制展示字串
memcpy((void*)pShowStr, (const void*)branch_stack, (size_t)(branch_stack_index + 2));
// 将展示字串地址存入staticArray展示数组
*(p_static_array ++) = (int)((pTrie->branch[i])->value[0]);
*(p_static_array ++) = (int)pShowStr;
// 防御数组越界
if(((int)(p_static_array - inArray))/2 + 8 >= SHOW_ARRAY_SIZE)
{
// 温数组越界，直接退出
fprintf(stderr, "Array out of bounds\n");
exit(1);
}
}
}
}
// 递归打印
TriePrint(pTrie->branch[i], inArray, totalMatchedNum);
}
}
// 出栈
branch_stack_index --;
return 0;
}
// 字典注释键值表
static const byte* dictionary_annotation_key[][2] =
{
{(byte*)((int*)&show_list_size), (byte*)"MAX SHOW:"},
{(byte*)((int*)&trie_low_offset), (byte*)"BYTE LOW:"},
{(byte*)((int*)&trie_high_offset), (byte*)"BYTE HIGH:"},
{(byte*)((int*)&trie_min_insert), (byte*)"INSERT MIN:"},
{(byte*)((int*)&trie_max_insert), (byte*)"INSERT MAX:"},
{(byte*)((int*)&trie_min_granularity), (byte*)"STEP FORWARD MIN:"},
{(byte*)((int*)&trie_max_granularity), (byte*)"STEP FORWARD MAX:"},
{(byte*)((int*)&use_english_mode), (byte*)"USE ENGLISH MODE:"},
{(byte*)((int*)&use_substring_match), (byte*)"USE SUBMATCH MODE:"},
{NULL, (byte*)"|||/"},
{NULL, (byte*)NULL}
};
// 获取字典启动参数函数
bool AlreadyGetStartupParameters(byte* pLineHead, byte* pLineEnd)
{
// 空行返回假
if(pLineHead == NULL || pLineEnd == NULL || pLineHead == pLineEnd)
{
return false;
}
// 临时变量（先存下结尾的字节）
byte tmp = *pLineEnd;
// 置字符串结束符
*pLineEnd = 0x00;
// 三级指针
byte*** ptr = (byte***)dictionary_annotation_key;
// 计算启动参数数组长度
while((*(ptr + 1)) != NULL)
{
// 利用指针实现memcasecmp功能
byte *lp = (byte*)pLineHead, *rp = (byte*)(*(ptr + 1));
while((*lp != 0x00) && (*rp != 0x00))
{
if(
(ISUPPERLETTER(*lp) ?(*lp + 0x20) :*lp) != (ISUPPERLETTER(*rp) ?(*rp + 0x20) :*rp)
)
{
break;
}
lp ++, rp ++;
}
// 如果右指针滑动到结尾，说明匹配成功
if(*rp == 0x00)
{
// 判断是否为字典注释结束符串“|||/”
if((*(ptr + 0)) == NULL)
{
// 还原原始数据
*pLineEnd = tmp;
// 获取参数已经结束
return true;
}
// 提取启动参数
int thisParameters = atoi((char*)((byte*)pLineHead + ((byte*)rp - (byte*)(*(ptr + 1)))));
// 赋值给对应键名变量
*((int*)(*(ptr + 0))) = (int)thisParameters;
// 匹配了就跳出循环
break;
}
// 三级指针推进
ptr += sizeof(*dictionary_annotation_key) / sizeof(**dictionary_annotation_key);
}
// 还原原始数据
*pLineEnd = tmp;
// 还未获取到字典注释结束符串“|||/”
return false;
}
// 填充字典树
int TrieFillDictionary(PTrieNode rootPTrieNode, char* inFile)
{
//初始化一些全局参数，offset取值0 ~ 255
trie_low_offset = 0x00;
trie_high_offset = 0xFF;
trie_branch_size = trie_high_offset - trie_low_offset + 1;
// 插入长度限制阈
trie_min_insert = 1;
trie_max_insert = TRIE_DEEP_SIZE - 1;
// 设置统计粒度
trie_min_granularity = 1;
trie_max_granularity = 1;
// 是否启用英文匹配模式
use_english_mode = false;
use_substring_match = false;
// 读取字典文件
FILE* fp = fopen(inFile, "rb");
if(fp == NULL)
{
fprintf(stderr, "Can't read the dictionary \"%s\"\n", inFile);
exit(1);
}
// 文件流回首（非常必要）
fseek(fp, 0, SEEK_SET);
// 文件流置末尾
fseek(fp, 0, SEEK_END);
// 测量db字典文件尺寸
int inFileLen = ftell(fp);
// 文件流回首
fseek(fp, 0, SEEK_SET);
// 开辟行存容器
const byte* inFileContainer = (byte*)malloc((inFileLen + 3) * sizeof(byte));
if(inFileContainer == NULL)
{
// 无法分配内存
fprintf(stderr, "Can't have enough memory\n");
exit(1);
}
// 读入内存
fread((void*)inFileContainer, (size_t)inFileLen, sizeof(byte), fp);
// 关闭文件流
fclose(fp);
// 设置数据行指针
byte* pLine = (byte*)inFileContainer;
// 行末置结束符（防止行末无换行）
pLine[inFileLen + 0] = 0x0D;
pLine[inFileLen + 1] = 0x0A;
pLine[inFileLen + 2] = BYTE_EOF;
// 辅助指针
byte* p = pLine;
// 当前小行的长度
int lineLen = 0;
// 是否已经获取启动参数
bool alreadyGetParameters = false;
// 插入的词组数目
int insertWordsCount = 0;
// 获取字典启动参数的步探深度
int getStartupParametersDeep = 0;
// 开启主循环
while(*p != BYTE_EOF)
{
// 过滤换行符
while((*p != BYTE_EOF) && (*p != 0x0D) && (*p != 0x0A))
{
// 字节筛选器
if(
// 如果已经获取到启动参数
(alreadyGetParameters == true)
&&
(!(
// 字节筛选器低位
(*p >= trie_low_offset)
// 字节筛选器高位
&& (*p <= trie_high_offset)
))
)
{
while((*p != BYTE_EOF) && (*p != 0x0D) && (*p != 0x0A))
{
p ++;
}
while((*p != BYTE_EOF) && ((*p == 0x0D) || (*p == 0x0A)))
{
p ++;
}
pLine = p;
continue;
}
p ++;
}
// 计算行长
lineLen = p - pLine;
// 当行长太短或太长，则抛弃
if(
// 如果已经获取到启动参数
(alreadyGetParameters == true)
// 插入词组长度限制
&& (trie_min_insert <= lineLen)
&& (lineLen <= trie_max_insert)
// 系统最大长度限制
&& ((lineLen + 1) < TRIE_DEEP_SIZE)
)
{
// 将该词行插入字典树
TrieInsert(rootPTrieNode, (byte*)pLine, lineLen);
insertWordsCount ++;
}
// 获取启动参数子程
if(alreadyGetParameters == false)
{
if((++ getStartupParametersDeep) > 512)
{
// 启动参数隐藏太深，无法获取
fprintf(stderr, "Can't get startup parameters\n");
exit(1);
}
// 当检测到注释结束符“|||/”时
if(AlreadyGetStartupParameters(pLine, p) == true)
{
// 检验已获取的参数的合法性
if(
(! ( (0 <= trie_low_offset) && (trie_low_offset <= 255) ) )
|| (! ( (0 <= trie_high_offset) && (trie_high_offset <= 255) ) )
|| (trie_low_offset > trie_high_offset)
|| (trie_min_insert > trie_max_insert)
|| (trie_max_insert > TRIE_DEEP_SIZE)
|| (trie_min_granularity <= 0)
|| (trie_max_granularity <= 0)
)
{
// 启动参数错误，直接退出
fprintf(stderr, "Wrong startup trie database parameters\n");
exit(1);
}
// 计算Trie节点分支数目（这是一个全局变量）
trie_branch_size = (trie_high_offset - trie_low_offset) + 1;
// 关闭获取参数开关，开始执行字典词条插入
alreadyGetParameters = true;
}
}
// 过滤换行符
while((*p != BYTE_EOF) && ((*p == 0x0D) || (*p == 0x0A)))
{
p ++;
}
// 指针行进
pLine = p;
}
// 释放行存容器
free((byte*)inFileContainer);
// 检测是否已插入词条
if(insertWordsCount == 0)
{
// 读取错误，直接退出
fprintf(stderr, "Do not insert any words\n");
exit(1);
}
return 0;
}
// 统计字典树
int TrieStaticDictionary(PTrieNode rootPTrieNode, FILE* fp)
{
if(fp == NULL)
{
// 读取错误，直接退出
fprintf(stderr, "Read stdin error\n");
exit(1);
}
// 指针流位重置（非常必要）
fseek(fp, 0, SEEK_SET);
// 初始化匹配返回结构体
RetTrieStatic retTrieStatic = {}, *pRetTrieStatic = &retTrieStatic;
// 余行拼接器：|余行区域|读行区域|末端|
byte line[MAX_JOIN_SIZE + LINE_BUFF_SIZE + 3];
// 调用TrieStatic函数时，line的偏移量
int offset = 0;
// 总匹配词目
int totalMatched = 0;
// 是否开启拼接匹配
bool isJoinMatchMode = false;
// 循环读取STDIN输入流，统计词频
while(! feof(fp))
{
// 每次实际读取的字节数
size_t readLen = fread((void*)(line + MAX_JOIN_SIZE), sizeof(byte), LINE_BUFF_SIZE, fp);
// 置byte结束符
*(line + MAX_JOIN_SIZE + readLen) = BYTE_EOF;
// 最后一次读取可以关闭拼接模式，因为readLen长度过小
if(readLen < LINE_BUFF_SIZE)
{
isJoinMatchMode = false;
}
// 在Trie中匹配数据块
TrieStatic(rootPTrieNode, (byte*)(line + MAX_JOIN_SIZE - offset), (size_t)(readLen + offset), use_substring_match, use_english_mode, isJoinMatchMode, pRetTrieStatic, trie_min_granularity, trie_max_granularity);
// 下次调用TrieStatic函数line的偏移值
offset = pRetTrieStatic->restBytesNumber;
// 是否开启拼接匹配
if(pRetTrieStatic->restMatchPtr != NULL)
{
// 拼接上次余下的数据
memcpy((void*)((line + MAX_JOIN_SIZE) - offset), (const void*)pRetTrieStatic->restMatchPtr, (size_t)pRetTrieStatic->restBytesNumber);
// 开启拼接匹配开关
isJoinMatchMode = true;
}
else
{
// 关闭拼接匹配开关
isJoinMatchMode = false;
}
// 总匹配词数计数器
totalMatched += pRetTrieStatic->thisMatchedCount;
}
// 返回匹配的词次
return totalMatched;
}
// 打印统计表头
int PrintStaticHead()
{
// 创建时间结构体
time_t startTime;
struct tm* timeinfo;
// 打印统计表头
time(&startTime);
timeinfo=localtime(&startTime);
fprintf
(
stdout
,
"%s"
"------------------------------------------\n"
" Times Percent Words\n"
"------------------------------------------\n"
,
asctime(timeinfo)
);
return 0;
}
// MAIN主函数入口
int main(int argc, char** argv)
{
// 参数数目不对
if(argc != 2)
{
// 第一次开关筛选
fputs(HELP_INFORMATION, stderr);
exit(1);
}
// 检测是否为-m开关
if(strcasecmp(argv[1], "m") == 0)
{
// 打印dicfile头
fputs(DICFILE_HEAD, stdout);
exit(0);
}
// 是否符合规则
char* ptr = strchr(argv[1], ':');
if(ptr == NULL)
{
// 第二次开关筛选
fputs(HELP_INFORMATION, stderr);
exit(1);
}
// 设结束符，同时后移指针
*(ptr ++) = '\0';
// 排序枚举类
SortMode sortMode = SORT_NO;
// 解析开关
if(strcasecmp(argv[1], "+u") == 0)
{
sortMode = SORT_ASC;
}
else if(strcasecmp(argv[1], "-u") == 0)
{
sortMode = SORT_DESC;
}
else if(strcasecmp(argv[1], "u") == 0)
{
sortMode = SORT_NO;
}
else
{
// 第三次开关筛选
fputs(HELP_INFORMATION, stderr);
exit(1);
}
// 创建Trie树根
PTrieNode rootPTrieNode = PTrieNodeCreate((int)(0xFF - 0x00 + 1));
// 获取启动参数，并填充Trie树
TrieFillDictionary(rootPTrieNode, ptr);
// 统计词条
int totalMatchedNum = TrieStaticDictionary(rootPTrieNode, stdin);
// 如果不排序，则直接输出
if(sortMode == SORT_NO)
{
// 打印统计表头
PrintStaticHead();
// 将词频统计印到staticArray数组中
TriePrint(rootPTrieNode, NULL, totalMatchedNum);
// 释放Trie树
TrieFree(rootPTrieNode);
return 0;
}
else
{
// 分配展示数组内存（占用不超过8M）
const int* staticArray = (int*)malloc(SHOW_ARRAY_SIZE * 2 * sizeof(int));
// 将全局指针置为展示数组首地址
p_static_array = (int*)staticArray;
// 将词频统计印到staticArray数组中
TriePrint(rootPTrieNode, staticArray, totalMatchedNum);
// 释放Trie树
TrieFree(rootPTrieNode);
// 计算统计到的不同词条数目
int staticKeyCount = (p_static_array - staticArray) / 2;
if(staticKeyCount == 0)
{
// 没有匹配到任何关键词目，直接退出
fprintf(stderr, "Do not match any keywords\n");
exit(1);
}
// 根据抽屉原理
if(totalMatchedNum == staticKeyCount)
{
// 如果没有重复，则不需要排序
}
else
{
// 对展示数组进行混合快排
if(QsortStaticArray((int*)staticArray, sortMode, 0, (staticKeyCount - 1)) == false)
{
// 如果排序失败，直接终止
fprintf(stderr, "Qsort the result failed\n");
exit(1);
}
}
// 打印统计表头
PrintStaticHead();
// 创建辅助指针，指向展示数组首地址
int* pStaticArray = (int*)staticArray;
// 打印排序后的统计表
while(pStaticArray < p_static_array)
{
// 转变数据类型
int staticStrTimes = (int)(*(pStaticArray ++));
char* staticStr = (char*)(*(pStaticArray ++));
// 打印统计列表
fprintf(stdout, "%8d %7.4f %s\n", staticStrTimes, ((staticStrTimes * 1.0 / totalMatchedNum) * 100.0), staticStr);
// 列表显示计数器
if(
(show_list_size > 0)
// 超过设定范围，直接退出
&& ((++ already_show_list) >= show_list_size)
)
{
exit(1);
}
// 释放展示字串空间
if(staticStr != NULL)
{
free(staticStr);
}
}
// 释放展示数组
free((int*)staticArray);
}
return 0;
}

/*
**************************************************************************
gcc ./idso.c -o idso
**************************************************************************
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <time.h>
// 引入头文件
//#include "idso.h"
// 跨平台宏函数
#if defined(WIN32) || defined(_WIN32) || defined(__MINGW32__) || defined(_MSC_VER)
#include <windows.h>
#define IDSO_SLEEP(x) Sleep(x)
#else
#include <unistd.h>
#define IDSO_SLEEP(x) usleep(x*1000)
#endif
// 定义成语接龙舒适度（满分100）
#define IDSO_SHELL_DIFFICULTY 16
// 标准行长
#define LINE_BUFF_SIZE 1024
// 用户输入字长
#define INPUT_BUFF_SIZE 36
// 定义hash桶深度
#define BUCKET_DEEP_SIZE 4
// 声明成语接龙数组（在idso.h文件中定义）
static char* idso_data[];
// 定义成语接龙数组
static char* idso_data[] =
{
NULL,
"一丁不识",
"一不扭众",
"一世之雄",
"一世龙门",
"一丘一壑",
"一丘之貉",
"一丝一毫",
"一丝不挂",
"一丝不紊",
"一丝不苟",
"一丝两气",
"一丝半粟",
"一个半个",
"一串骊珠",
"一举一动",
"一举万里",
"一举三反",
"一举两全",
"一举两得",
"一举千里",
"一举成名",
"一之为甚",
"一之已甚",
"一之谓甚",
"一乱涂地",
"一了百了",
"一了百当",
"一事不知",
"一事无成",
"一五一十",
"一些半些",
"一人之交",
};
// 计算接龙数组长度
static const int idso_data_size = sizeof(idso_data) / sizeof(idso_data[0]);
// 定义BDKR哈希模式枚举类型
typedef enum
{
FIRST_UC = 1, // 首字hash
LAST_UC = -1, // 末字hash
WHOLE_UC = 0 // 全字hash
} BdkrMode;
// 定义IdsoShellCore函数返回值枚举类型
typedef enum
{
SHELL_USER_EXIT = 0, // 退出SHELL
SHELL_USER_SUCCEED = 1, // 用户获胜
SHELL_USER_FAILED = 2, // 用户战败
SHELL_USER_RAND = 3 // 随机成语
} RetIdso;
// 获取UTF8汉字字节长度
int GetUcLen(unsigned char inUc)
{
if((inUc & 0x80) == 0x00)
{
return 1;
}
int ucLen = 0;
while((inUc & 0x80) == 0x80)
{
ucLen ++;
inUc <<= 1;
}
return ucLen;
}
// 快排辅助宏
#define UQSORT_STACK_SIZE 1024
#define UCS2UINT(x) (unsigned int)(((unsigned char)(x)[0]<<24)|((unsigned char)(x)[1]<<16)|((unsigned char)(x)[2]<<8)|((unsigned char)(x)[3]))
#define SWAP(a,b) do{char* __tmpNumber=(a); (a)=(b), (b)=__tmpNumber;}while(0)
// 混合型快排算法
int UcsQsort(char* ucStr[], int left, int right)
{
if(
ucStr == NULL
|| left < 0
|| right < 0
|| left >= right
)
{
return 1;
}
// 定义数组堆栈，用来记录
int pStack[UQSORT_STACK_SIZE][2], pStackIndex = 0;
// 快排分区首次出栈
pStack[pStackIndex ][0] = left;
pStack[pStackIndex ++][1] = right;
int i, j;
while(pStackIndex > 0)
{
// 快排分区出栈
i = pStack[-- pStackIndex][0];
j = pStack[ pStackIndex][1];
if(i < j)
{
int ls = i, rs = j;
char* baseNum = NULL;
// 对于小数组，直接采用选择排序
if(j-i+1 < 8)
{
int p, max;
while (ls < rs)
{
max = ls;
for (p = ls+1; p <= rs; p ++)
{
if (UCS2UINT(ucStr[p]) > UCS2UINT(ucStr[max]))
{
max = p;
}
}
SWAP(ucStr[max], ucStr[rs]);
rs--;
}
continue;
}
// 计算中间项数
int mid = (j-i)/2 + i + 1;
// 整理局部数组，以使中位数居中
if(UCS2UINT(ucStr[i]) > UCS2UINT(ucStr[mid]))
{
SWAP(ucStr[i], ucStr[mid]);
}
if(UCS2UINT(ucStr[i]) > UCS2UINT(ucStr[j]))
{
SWAP(ucStr[i], ucStr[j]);
}
if(UCS2UINT(ucStr[mid]) > UCS2UINT(ucStr[j]))
{
SWAP(ucStr[mid], ucStr[j]);
}
SWAP(ucStr[mid], ucStr[i]);
// 快排交换核心
ls = i, rs = j, baseNum = ucStr[ls];
while(ls<rs)
{
while(ls < rs && UCS2UINT(ucStr[rs]) >= UCS2UINT(baseNum))
{
rs--;
}
ucStr[ls] = ucStr[rs];
while(ls < rs && UCS2UINT(ucStr[ls]) <= UCS2UINT(baseNum))
{
ls++;
}
ucStr[rs] = ucStr[ls];
}
ucStr[ls] = baseNum;
// 快排分区数组堆栈实现
int k = ls;
if(i < k)
{
// 快排分区入栈
pStack[pStackIndex ][0] = i;
pStack[pStackIndex ++][1] = k-1;
}
if(k < j)
{
// 快排分区入栈
pStack[pStackIndex ][0] = k+1;
pStack[pStackIndex ++][1] = j;
}
}
}
return 0;
}
// BKDR哈希算法（当传递参数inStrLen为1时表示获取首字hash，为-1时表示获取尾字hash，其他数表示获取全字hash）
int BKDRHash(char *inStr, BdkrMode inMode, int bkdrBase)
{
// 空串返回直接零
if(inStr == NULL)
{
return 0;
}
int inStrLen = -1;
if(inMode == FIRST_UC)
{
inStrLen = GetUcLen((unsigned char)*inStr);
}
else if(inMode == LAST_UC)
{
inStrLen = 1;
inStr += (int)(strlen(inStr) - 1);
while(((*inStr) & 0xC0) == 0x80)
{
inStrLen ++;
inStr --;
}
}
unsigned int seed = 131;
unsigned int hash = 0;
char* str = inStr;
while ((str - inStr < inStrLen) || ((inMode == WHOLE_UC) && (*str)))
{
hash = hash * seed + (*(str ++));
}
return (int)(hash % bkdrBase);
}
// 获取成语首字索引hash表
void* GetFirstUCharacterHashTable(char** idsoData, int idsoDataSize, int hashBase)
{
// 当hash表基数小于成语数组长度时，返回空指针
if(hashBase < idsoDataSize)
{
// return NULL;
}
// 动态分配首字索引hash表内存
int (*firstUcHashTable)[2] = (int(*)[2])calloc(hashBase * 2, sizeof(int));
// 循环遍历每个成语，构建首字索引hash表
int i;
for(i = 1; i < idsoDataSize; i++)
{
// 计算首字hash值
int firstUcHashIndex = BKDRHash((char*)idsoData[i], FIRST_UC, hashBase);
// 构建首字索引hash表
if(firstUcHashTable[firstUcHashIndex][0] == 0)
{
firstUcHashTable[firstUcHashIndex][0] = i;
firstUcHashTable[firstUcHashIndex][1] ++;
continue;
}
// 获取UTF汉字字节长度
int ucLen = GetUcLen((unsigned char)(*(idsoData[i])));
// 比对成语首字是否重复
if(strncmp((char*)idsoData[i], (char*)idsoData[firstUcHashTable[firstUcHashIndex][0]], ucLen) == 0)
{
// 首字成语组计数器
firstUcHashTable[firstUcHashIndex][1] ++;
}
else
{
// 构建首字索引失败，释放hash表内存
free(firstUcHashTable);
// 返回空指针
return NULL;
}
}
// 返回首字索引hash表
return (void*)firstUcHashTable;
}
// 获取成语全字hash桶
void* GetWholeUCharacterHashTable(char** idsoData, int idsoDataSize, int hashBase)
{
// 当hash表基数小于成语数组长度时，返回空指针
if(hashBase < idsoDataSize)
{
// return NULL;
}
// 计算需要的hash桶数量
int bucketNumber = (hashBase / BUCKET_DEEP_SIZE) * (64 / BUCKET_DEEP_SIZE);
// 动态分配hash桶内存
int (*wholeUcHashTable)[BUCKET_DEEP_SIZE] = (int(*)[BUCKET_DEEP_SIZE])calloc(bucketNumber * BUCKET_DEEP_SIZE, sizeof(int));
// 循环遍历每个成语，构建全字hash桶
int i;
for(i = 1; i< idsoDataSize; i++)
{
// 计算全字hash值
int wholeUcHashIndex = BKDRHash((char*)idsoData[i], WHOLE_UC, bucketNumber);
// 循环桶深度
int j;
for(j = 0; ; j++)
{
// 超过桶深度、则返回空指针
if(j >= BUCKET_DEEP_SIZE)
{
// 构建hash桶失败，释放hash桶内存
free(wholeUcHashTable);
return NULL;
}
// 构建hash桶
if(wholeUcHashTable[wholeUcHashIndex][j] == 0)
{
wholeUcHashTable[wholeUcHashIndex][j] = i;
break;
}
}
}
// 返回首字索引hash表
return (void*)wholeUcHashTable;
}
// 帮助信息
int GetHelpInformation()
{
fprintf
(
stdout
,
"[欢迎你使用 \"成语接龙\"] --- 输入Q退出\n"
"[小娜]: 我是萌萌的小娜！\n"
);
return 0;
}
// 成语接龙SHELL
RetIdso IdsoShellCore(int firstUcHashBase, int bucketNumber, int (*firstUcHashTable)[2], int (*wholeUcHashTable)[BUCKET_DEEP_SIZE])
{
// 随机一个待接成语
char* needUcStr = (char*)idso_data[rand()% (idso_data_size-1) + 1];
// 计算待接成语末字hash
int needUcStrLastUcHash = BKDRHash(needUcStr, LAST_UC, firstUcHashBase);
// 如果待接成语不可接，则退出SHELL，休眠随机种子以待下次随机
if((firstUcHashTable[needUcStrLastUcHash][0] == 0) || (firstUcHashTable[needUcStrLastUcHash][1] <= IDSO_SHELL_DIFFICULTY))
{
return SHELL_USER_RAND;
}
// 接受输入字符串的容器
char inPut[INPUT_BUFF_SIZE + 1];
// 接受输入字符串转输入数组的容器
char* sArgv[3];
int sArgc;
// 接龙循环
while(true)
{
// 显示人机交互信息
fprintf
(
stdout
,
"[小娜]: %s\n"
"[>>>>]: "
,
needUcStr
);
// 接收用户输入流
fgets(inPut, INPUT_BUFF_SIZE, stdin);
// 找到第一个换行符的位置
char* p =strchr(inPut, '\n');
// 如果输入为空，则显示帮助信息
if(p == inPut)
{
GetHelpInformation();
continue;
}
// 置输入流结束符
if(p != NULL)
{
*p='\0';
}
// 使用按键 Q退出
if(! strcasecmp(inPut, "Q"))
{
fprintf(stdout, "[感谢使用，再见！]");
return SHELL_USER_EXIT;
}
// 切分输入字符流
sArgc = 0;
p = inPut;
while(*p)
{
while(*p == ' '|| *p == '\t')
{
*(p ++) = 0;
}
if(*p)
{
if(++ sArgc > 3)
{
break;
}
sArgv[sArgc - 1] = p;
}
while(*p && (*p != ' ') && (*p != '\t'))
{
p += GetUcLen((unsigned char)*p);
}
}
// 执行成语接龙核心
if(sArgc == 1)
{
bool isCorrect = false;
// 计算用户输入全字hash值
int inPutWholeUcHash = BKDRHash(sArgv[0], WHOLE_UC, bucketNumber);
// 遍历全字hash桶
int j;
for(j=0; (j<BUCKET_DEEP_SIZE) && (wholeUcHashTable[inPutWholeUcHash][j] != 0); j++)
{
char* pUcStr = (char*)idso_data[wholeUcHashTable[inPutWholeUcHash][j]];
// 比对桶中字符串
if(! strcmp(sArgv[0], (char*)pUcStr))
{
// 计算用户输入首字hash
int inPutFirstUcHash = BKDRHash((char*)pUcStr, FIRST_UC, firstUcHashBase);
// 比对首字hash，判断首字是否可接龙
if(inPutFirstUcHash != needUcStrLastUcHash)
{
// 用户首字没有接上、退出循环
break;
}
// 计算末字hash索引
int inPutLastUcHash = BKDRHash((char*)pUcStr, LAST_UC, firstUcHashBase);
// 依据末字hash 检索以其打头的成语
if(firstUcHashTable[inPutLastUcHash][0] != 0)
{
// 机器随机挑选一个以其字打头的成语
int randIndex = rand() % firstUcHashTable[inPutLastUcHash][1];
char* rUcStr = (char*)idso_data[firstUcHashTable[inPutLastUcHash][0] + randIndex];
// 计算此随机成语末字hash
int rUcStrLastUcHash = BKDRHash(rUcStr, LAST_UC, firstUcHashBase);
// 判断此随机成语，若末字不可接
if(firstUcHashTable[rUcStrLastUcHash][0] == 0)
{
// 显示绝杀成语
fprintf(stdout, "[绝杀]: %s\n", rUcStr);
fprintf(stdout, "[你输了]\n");
return SHELL_USER_FAILED;
}
// 末字可接，更新待接成语
needUcStr = rUcStr;
// 末字可接，更新待接首字hash
needUcStrLastUcHash = rUcStrLastUcHash;
}
else
{
// 机器无法接龙用户输入的成语，用户获胜
return SHELL_USER_SUCCEED;
}
// 回答正确，则标记为真
isCorrect = true;
break;
}
}
// 用户回答错误
if(! isCorrect)
{
// 计算可接成语数目
int pickUpCount = firstUcHashTable[needUcStrLastUcHash][1];
// 当接龙难度过高时，机器给予用户提示
if(pickUpCount <= IDSO_SHELL_DIFFICULTY)
{
// 机器随机一个答案
char* promptUcStr = (char*)idso_data[firstUcHashTable[needUcStrLastUcHash][0] + rand() % pickUpCount];
fprintf(stdout, "[小娜悄悄提示 \"%s\"]\n", promptUcStr);
}
else
{
fprintf(stdout, "[小娜希望你能认真考虑！]\n");
}
}
}
else
{
// 帮助信息
GetHelpInformation();
}
}
}
// 主函数
int main(int argc, char *argv[])
{
// 使用混合快排，先对成语数组进行前4字节索引排序
UcsQsort((char**)idso_data, 1, idso_data_size-1);
int firstUcHashBase = 25511, wholeUcHashBase = 18311;
void *pFc = NULL, *pHc = NULL;
// 建立首字索引hash表
while((pFc = GetFirstUCharacterHashTable((char**)idso_data, idso_data_size, (++ firstUcHashBase))) == NULL);
int (*firstUcHashTable)[2] = (int(*)[2])pFc;
// 建立全字hash桶
while((pHc = GetWholeUCharacterHashTable((char**)idso_data, idso_data_size, (++ wholeUcHashBase))) == NULL);
int (*wholeUcHashTable)[BUCKET_DEEP_SIZE] = (int(*)[BUCKET_DEEP_SIZE])pHc;
// 计算hash桶基数
int bucketNumber = (wholeUcHashBase / BUCKET_DEEP_SIZE) * (64 / BUCKET_DEEP_SIZE);
// 初始化随机种子
srand((unsigned)time(NULL));
// 打印SHELL题头
GetHelpInformation();
// 调用成语接龙SHELL核心
while(true)
{
// 获取接口SHELL返回值
RetIdso retIdso = IdsoShellCore(firstUcHashBase, bucketNumber, firstUcHashTable, wholeUcHashTable);
// 检测是否退出接龙SHELL
if(retIdso == SHELL_USER_EXIT)
{
// 退出接龙SHELL
break;
}
else if(retIdso == SHELL_USER_SUCCEED)
{
// 用户获胜 fprintf(stdout, "[你赢了]\n");
fprintf
(
stdout
,
"[欢迎你使用 \"成语接龙\"] --- 输入Q退出\n"
"[小娜]: 我是萌萌的小娜！\n"
);
}
else if(retIdso == SHELL_USER_FAILED)
{
// 用户战败
fprintf
(
stdout
,
"[欢迎你使用 \"成语接龙\"] --- 输入Q退出\n"
"[小娜]: 我是萌萌的小娜！\n"
);
}
else if(retIdso == SHELL_USER_RAND)
{
// 延缓时间，提高rand()成语随机性
IDSO_SLEEP(1);
}
}
// 释放内存
free(firstUcHashTable);
free(wholeUcHashTable);
return 0;
}