提问者:小点点

如何统计每个单词在多个文本文件中的出现次数


我想统计每个单词在一组文本中的出现次数。 我已经能够统计每个单词在单个文本中的出现次数(我把文本中的每个单词插入一棵二叉树,再次遇到同一个单词时就把它的计数加一),但还不知道如何在所有文本的范围内统计每个单词。

愚蠢的例子:

文本1:你好,亚当,我叫亚当,我想要快乐。 文本2:你好,山姆,他是哪里人?

(每个单词在一篇文本中的出现次数)树1:亚当 2,你好 1,我 1,名字 1,是 1,我 1,想要 1,to 1,be 1,happy 1。(每个单词出现在多少篇文本中)树2:你好 2,亚当 1,山姆 1,where 1,是 2……

有人能给我解释一下我能做什么吗? 或者帮我做一个这样做的算法?

我的文件:

node.hpp

// One node of the word-count binary search tree.
//
// A node stores a word (`num`, used as the tree key), the occurrence count
// recorded for that word (`data`) and pointers to its two children. The class
// declares no destructor, so destroying a node leaves its children untouched.
class Node{

    private:
        Node *left = nullptr;           //left child, nullptr when absent
        Node *right = nullptr;          //right child, nullptr when absent
        std::string num;                //the word stored in this node
    public:
        int data = 0;                   //occurrence count; zero until setData() is called
        Node();                         //constructor
        void setData(std::string num, int data);    //stores the word and its count
        std::string getData();              //returns a copy of the stored word
        int &getOcc();                  //returns the count by reference, so callers can modify it
        void setLeft(Node *l);          //sets left child pointer
        Node* &getLeft();                //returns left child pointer by reference, so callers can re-link it
        void setRight(Node *r);         //sets right child pointer
        Node* &getRight();               //returns right child pointer by reference, so callers can re-link it
};

node.cpp

// Creates an empty node: no children, an empty word and a count of zero.
// The count is initialised explicitly so that calling getOcc() before the
// first setData() reads a defined value.
Node::Node()
    : left(nullptr), right(nullptr), data(0) {
}


// Stores the word `num` and its occurrence count `data` in this node.
void Node::setData(string num, int data){
    // The parameters shadow the members of the same name, so the members are
    // named through the class qualifier.
    Node::num = num;
    Node::data = data;
}


// Returns a copy of the word held by this node.
string Node::getData(){
    return num;
}

int &Node::getOcc(){
    return this->data;
}


// Makes `l` the left child of this node (nullptr detaches the child).
void Node::setLeft(Node *l){
    left = l;
}

// Returns the left child pointer by reference, so a caller can re-link the
// child by assigning to the result.
Node* &Node::getLeft(){
    return left;
}

// Makes `r` the right child of this node (nullptr detaches the child).
void Node::setRight(Node *r){
    right = r;
}

// Returns the right child pointer by reference, so a caller can re-link the
// child by assigning to the result.
Node* &Node::getRight(){
    return right;
}

bst.hpp

//BST class
// Binary search tree of words. Each node carries a word and an occurrence
// count; nodes are ordered by comparing the words with operator<.
//
// The tree allocates its nodes with new and frees them in the destructor, so
// copying is disabled: an implicit member-wise copy would leave two trees
// deleting the same nodes.
class BST{

    private:
        Node * root;        //root node pointer

    public:
        BST();                                  //constructor
        ~BST();                                 //destructor, frees every node
        BST(const BST &) = delete;              //no copying: the nodes have a single owner
        BST &operator=(const BST &) = delete;   //no copy assignment, for the same reason
        void Insert(string num, int data);      //inserts a new word with count `data`, or adds one to the count of a stored word
        void InsertIDF(string num, int data);      //inserts a new word with count `data`; a stored word is left unchanged
        bool find(string num);                 //finds whether a word is present in tree (definition not shown here)
        void min();                             //find and print the smallest word in the tree
        void max();                             //find and print the largest word in the tree
        void save_file(string filename);        //save the tree to file (definition not shown here)
        void Delete(string num);                //deletes a word from tree
        void LoadFromFile(string filename);     //loads words from file to tree (definition not shown here)
        void Print();                           //print tree to stdout, in word order


        //private functions used as helper functions in the public operations
    private:
        void printHelper(Node *root);
        bool findHelper(Node *root,string num);
        void InsertHelper(Node * &current, string num, int data);
        void InsertHelperIDF(Node * &current, string num, int data);
        void findMinHelper(Node* current);
        void findMaxHelper(Node * current);
        void saveHelper(ofstream &fout, Node* current);
        Node* DeleteHelper(Node *current, string num);
        Node * findMaximum(Node * n);
        void clear(Node *current);
};

bst.cpp

// Builds an empty tree: there is no root node yet.
BST::BST() : root(nullptr) {
}

// Frees every node reachable from the root.
BST::~BST(){
    clear(this->root);
}


// Deletes the subtree rooted at `current`: the left subtree, then the right
// subtree, then the node itself. A null `current` is a no-op.
void BST::clear(Node* current){
    if(current != nullptr){
        Node *leftSubtree = current->getLeft();
        Node *rightSubtree = current->getRight();
        clear(leftSubtree);
        clear(rightSubtree);
        delete current;
    }
}


// Public entry point for counting inserts: forwards the word `num` and the
// count `data` to InsertHelper, starting at the root.
void BST::Insert(string num, int data){
    InsertHelper(this->root, num, data);
}


// Inserts the word `num` into the subtree whose root pointer is `current`.
//
// A word that is not stored yet gets a new node whose count is `data`.
// A word that is already stored has its count raised by exactly one; `data`
// is not used in that case.
void BST::InsertHelper( Node * &current, string num, int data ){
    // `slot` always points at the child pointer (or root pointer) that would
    // have to be overwritten to attach a node at the present position.
    Node **slot = &current;

    while ( *slot != nullptr ){
        Node *node = *slot;
        const string stored = node->getData();

        if ( num < stored ){
            slot = &node->getLeft();
        } else if ( stored < num ){
            slot = &node->getRight();
        } else {
            // same word again: bump its count and stop
            node->setData( num, node->getOcc() + 1 );
            return;
        }
    }

    // reached an empty slot: attach a fresh leaf for the word
    *slot = new Node();
    (*slot)->setData( num, data );
    (*slot)->setLeft( nullptr );
    (*slot)->setRight( nullptr );
}


// Public entry point for non-counting inserts: forwards the word `num` and
// the count `data` to InsertHelperIDF, starting at the root.
void BST::InsertIDF(string num, int data){
    InsertHelperIDF(this->root, num, data);
}


// Inserts the word `num` with count `data` into the subtree whose root
// pointer is `current`, unless the word is already stored; an existing node
// is left exactly as it is.
void BST::InsertHelperIDF( Node * &current, string num, int data){
    // `slot` always points at the child pointer (or root pointer) that would
    // have to be overwritten to attach a node at the present position.
    Node **slot = &current;

    while ( *slot != nullptr ){
        Node *node = *slot;
        const string stored = node->getData();

        if ( num < stored ){
            slot = &node->getLeft();
        } else if ( stored < num ){
            slot = &node->getRight();
        } else {
            return;     // word already present: nothing to do
        }
    }

    // reached an empty slot: attach a fresh leaf for the word
    *slot = new Node();
    (*slot)->setData( num, data );
    (*slot)->setLeft( nullptr );
    (*slot)->setRight( nullptr );
}

void BST::min(){
    findMinHelper(root);
}

// Writes the smallest word of the subtree rooted at `current` to cout.
// Prints nothing when the subtree is empty.
void BST::findMinHelper(Node* current){
    if(current == nullptr)
        return;

    // the smallest word sits in the node that has no left child
    while(current->getLeft() != nullptr)
        current = current->getLeft();

    cout<<current->getData();
}

void BST::max(){
    findMaxHelper(root);
}

// Writes the largest word of the subtree rooted at `current` to cout.
// Prints nothing when the subtree is empty.
void BST::findMaxHelper(Node * current){
    if(current == nullptr)
        return;

    // the largest word sits in the node that has no right child
    while(current->getRight() != nullptr)
        current = current->getRight();

    cout<<current->getData();
}



void BST::Print(){
    printHelper(root);
}

// In-order walk of the subtree rooted at `current`: left subtree, then the
// node, then the right subtree. Each node is written to cout as
// "<word> <count> ".
void BST::printHelper(Node *current){
    if(current != nullptr){
        printHelper(current->getLeft());
        cout<<current->getData() << " " << current->getOcc() << " ";
        printHelper(current->getRight());
    }
}

// Hands the word `num` to DeleteHelper, starting at the root, and adopts the
// node it returns as the new root.
void BST::Delete(string num){
    Node *newRoot = DeleteHelper(this->root, num);
    this->root = newRoot;
}

// Removes the node holding the word `num` from the subtree rooted at
// `current` and returns the root of the resulting subtree, which the caller
// must store back into the link it followed.
//
// If `num` is not stored, the subtree is returned unchanged and nothing is
// printed. Each successful removal prints "Deleted!!!" exactly once.
Node* BST::DeleteHelper(Node *current, string num){
    if(current == nullptr)
        return nullptr;             //empty subtree: the word is not stored

    if (num < current->getData()) {
        current->setLeft(DeleteHelper(current->getLeft(), num));
        return current;
    }
    if (current->getData() < num) {
        current->setRight(DeleteHelper(current->getRight(), num));
        return current;
    }

    //key is found at `current`
    if (current->getLeft() == nullptr || current->getRight() == nullptr) {
        //at most one child: that child (possibly nullptr) takes this node's place
        Node *replacement = (current->getLeft() == nullptr) ? current->getRight()
                                                            : current->getLeft();
        delete current;
        cout<<"Deleted!!!";
        return replacement;
    }

    //two children: locate the largest node of the left subtree and its parent
    Node *parent = current;
    Node *maxnode = current->getLeft();
    while (maxnode->getRight() != nullptr) {
        parent = maxnode;
        maxnode = maxnode->getRight();
    }

    //move the word and count of that node up into this one; the ordering is
    //preserved because it is larger than everything else on the left and
    //smaller than everything on the right
    current->setData(maxnode->getData(), maxnode->getOcc());

    //unlink the now duplicated node; it has no right child, so its left
    //subtree simply takes its place
    if (parent == current)
        parent->setLeft(maxnode->getLeft());
    else
        parent->setRight(maxnode->getLeft());
    delete maxnode;

    cout<<"Deleted!!!";
    return current;
}

// Returns the right-most node of the subtree rooted at `n`, i.e. the node
// holding its largest word. `n` is dereferenced without a null check.
Node* BST::findMaximum(Node * n){
    while(n->getRight() != nullptr)
        n = n->getRight();
    return n;
}

这是我的main.cpp

// Excerpt from main.cpp: walks every story, sentence and word and feeds each
// word to `tree`. The excerpt is truncated; the outermost loop is not closed
// in the visible lines.
int x = 0;   // NOTE(review): not used anywhere in the visible lines
    // go through each story
    for( Histoire * histoire : * histoires ) {
        // go through each sentence
        for( Phrase p : * histoire ) {
           // go through each word
            for ( Phrase w : p ){
                // render the word as text through a string stream so that it
                // can be passed to Insert as a string
                std::stringstream sstream;
                sstream << w;
                std::string s = sstream.str(); 
                tree.Insert(s , 1);   // insert the word with an initial count of 1; presumably `tree` is a BST, whose Insert adds one to the count of a repeated word
            }


            // treeIDF.Insert(t,1);

        };

共1个答案

匿名用户

嗯,这看起来确实像是一个家庭作业:)

你确定二叉树是适合这类问题的数据结构吗? 正如评论中所建议的,最好是简单地使用std::map。

#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

/// Counts the whitespace-separated words of one text file into a shared tally.
///
/// @param filename     path of the text file to read
/// @param words_count  running word -> occurrence map; the counts from this
///                     file are added on top of whatever it already holds, so
///                     calling the function once per file yields totals
///                     across all files
///
/// A file that cannot be opened contributes nothing to the tally; a warning
/// is written to std::cerr so that a mistyped path does not pass unnoticed.
void processFile(const std::string &filename,
                 std::map<std::string, int> &words_count) {
  std::ifstream load_file(filename);
  if (!load_file) {
    std::cerr << "processFile: cannot open '" << filename << "'\n";
    return;
  }

  std::string word;
  while (load_file >> word) {
    // operator[] value-initialises a missing entry to 0, so a single lookup
    // covers both the first sighting of a word and every later one.
    ++words_count[word];
  }
}

// Tallies the words of text1.txt and text2.txt into one map and prints every
// word followed by its total count, one pair per line, in the map's key order.
int main() {
  std::map<std::string, int> words_count;
  const std::vector<std::string> files_to_process = {"text1.txt", "text2.txt"};

  for (auto file = files_to_process.cbegin(); file != files_to_process.cend();
       ++file) {
    processFile(*file, words_count);
  }

  for (auto entry = words_count.cbegin(); entry != words_count.cend();
       ++entry) {
    std::cout << entry->first << " " << entry->second << std::endl;
  }
  return 0;
}

不过,这段代码还需要改进,比如去掉单词中的特殊字符、把逻辑封装到类中等等。如果你不能使用STL(无论出于何种原因),只需自己创建一个键值结构并实现相应的查找即可。