C语言实现英文文本词频统计

2025-06-11 02:04:28 C语言 3857

实现英文文本的词频统计可以通过C语言来完成，主要步骤包括读取文本文件、分割单词、统计单词出现的次数，并最终输出结果。以下是一个基本的实现示例：

实现步骤：

读取文本文件：使用标准C库中的文件操作函数，如fopen()打开文件，再用fscanf()按空白字符分隔逐词读取，或用fgets()逐行读取文本内容。
分割单词：对读取的文本内容进行分割，可以使用字符串处理函数如strtok()或自定义函数来识别和提取单词。注意处理标点符号和空格。
统计单词出现次数：使用哈希表（或关联数组）来存储单词及其出现的次数。每当遇到一个新单词时，将其插入哈希表并把计数设为1；如果单词已经存在，则将其计数加一。
输出结果：最后遍历哈希表，输出每个单词及其出现次数。

示例代码：

c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

// Capacity, in bytes, of the buffers used to hold a single word.
#define MAX_WORD_LEN 100

// One entry of the hash table: a word, its occurrence count, and a link to
// the next entry that landed in the same bucket.
struct WordNode {
    char word[MAX_WORD_LEN];   // the word text; used as the lookup key
    int count;                 // number of occurrences recorded so far
    struct WordNode *next;     // next node in the same bucket, or NULL
};

// Number of buckets in the hash table.
#define HASH_SIZE 100

// The hash table itself: each slot is the head of a singly linked chain of
// the words whose hash maps to that slot (NULL when the bucket is empty).
struct WordNode *hashTable[HASH_SIZE];

/*
 * Map a NUL-terminated string to a bucket index in [0, HASH_SIZE).
 *
 * Polynomial hash with multiplier 31. The arithmetic is unsigned, so
 * wrap-around on long inputs is well defined. Each byte is read as
 * unsigned char: plain char has implementation-defined signedness, and
 * hashing it directly would give bytes >= 0x80 (e.g. UTF-8 text) a
 * different bucket on different platforms.
 *
 * word: string to hash; it is not modified.
 * Returns the bucket index for the string.
 */
unsigned int hashFunc(const char *word) {
    unsigned int hash = 0;
    for (const unsigned char *p = (const unsigned char *)word; *p != '\0'; p++) {
        hash = hash * 31u + *p;
    }
    return hash % HASH_SIZE;
}

// 插入单词到哈希表
void insertWord(char *word) {
    unsigned int index = hashFunc(word);
    struct WordNode *node = hashTable[index];

    // 查找单词是否已经在哈希表中
    while (node != NULL) {
        if (strcmp(node->word, word) == 0) {
            // 单词已存在，计数加一
            node->count++;
            return;
        }
        node = node->next;
    }

    // 如果单词不存在，创建新节点插入
    node = (struct WordNode *) malloc(sizeof(struct WordNode));
    strncpy(node->word, word, MAX_WORD_LEN);
    node->count = 1;
    node->next = hashTable[index];
    hashTable[index] = node;
}

// 打印哈希表中的单词及其出现次数
void printWordFreq() {
    printf("Word Frequency:\n");
    for (int i = 0; i < HASH_SIZE; i++) {
        struct WordNode *node = hashTable[i];
        while (node != NULL) {
            printf("%s: %d\n", node->word, node->count);
            node = node->next;
        }
    }
}

// 清理哈希表
void cleanup() {
    for (int i = 0; i < HASH_SIZE; i++) {
        struct WordNode *node = hashTable[i];
        while (node != NULL) {
            struct WordNode *temp = node;
            node = node->next;
            free(temp);
        }
        hashTable[i] = NULL;
    }
}

/*
 * Normalize one whitespace-delimited token in place: strip every leading
 * and trailing punctuation character and convert what remains to lower
 * case. The result may be the empty string (for example for the token "--").
 *
 * Characters are cast to unsigned char before being passed to the <ctype.h>
 * functions; passing a negative char value to them is undefined behaviour.
 */
static void normalizeWord(char *word) {
    size_t start = 0;
    size_t end = strlen(word);

    while (start < end && ispunct((unsigned char)word[start])) {
        start++;
    }
    while (end > start && ispunct((unsigned char)word[end - 1])) {
        end--;
    }

    // Shift the kept range to the front of the buffer while lower-casing.
    // The destination index never passes the source index, so this is safe.
    size_t len = end - start;
    for (size_t i = 0; i < len; i++) {
        word[i] = (char)tolower((unsigned char)word[start + i]);
    }
    word[len] = '\0';
}

/*
 * Program entry point: ask for a file name on stdin, count how often each
 * word occurs in that file, and print the counts.
 *
 * Returns 0 on success, 1 if the file name cannot be read, the file cannot
 * be opened, or a read error occurs.
 */
int main(void) {
    char filename[100];
    char word[MAX_WORD_LEN];
    char wordFormat[32];
    int status = 0;

    // Start from an empty hash table.
    for (int i = 0; i < HASH_SIZE; i++) {
        hashTable[i] = NULL;
    }

    // Read the file name; the field width keeps scanf inside `filename`.
    printf("Enter the filename: ");
    if (scanf("%99s", filename) != 1) {
        fprintf(stderr, "Error reading filename\n");
        return 1;
    }

    FILE *fp = fopen(filename, "r");
    if (fp == NULL) {
        perror("Error opening file");
        return 1;
    }

    // Build "%<MAX_WORD_LEN-1>s" so the token read can never overflow `word`,
    // whatever MAX_WORD_LEN is set to. A token longer than that is read as
    // several consecutive pieces.
    snprintf(wordFormat, sizeof wordFormat, "%%%ds", MAX_WORD_LEN - 1);

    // Read whitespace-delimited tokens, normalize them and count them.
    while (fscanf(fp, wordFormat, word) == 1) {
        normalizeWord(word);
        // Tokens that consisted only of punctuation are not words.
        if (word[0] != '\0') {
            insertWord(word);
        }
    }
    if (ferror(fp)) {
        perror("Error reading file");
        status = 1;
    }

    fclose(fp);

    // Print the result and release the table.
    printWordFreq();
    cleanup();

    return status;
}

解释示例代码：

哈希表结构：使用哈希表来存储单词及其出现次数，解决查找效率问题。
单词处理：将读取的单词转换为小写并去除末尾的标点符号，保证统计准确性。
文件操作：使用标准C库函数进行文件的打开和读取操作。
内存管理：使用动态内存分配来管理哈希表节点，需要在程序结束时进行内存清理操作。

这个示例程序展示了如何使用C语言实现基本的英文文本词频统计功能，涵盖了文件操作、字符串处理、哈希表数据结构和内存管理等核心概念。

关键字：C语言、英文文本、词频统计、哈希表、文件操作