#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Capacity of the fixed-size per-word arrays below. The vocabulary is capped
// at this many distinct words.
const int MaxVocabNum = 50;

// Training corpus: one space-separated post per entry.
const char *postingList[] = {
    "my dog has flea problem help please",
    "maybe not take him to dog park stupid",
    "my dalmation is so cute I love him",
    "stop posting stupid worthless garbage",
    "mr licks ate my steak how to stop him",
    "quit buying worthless dog food stupid"
};

// Category[i] labels postingList[i]: true = abusive, false = normal.
// Must contain exactly one entry per post.
bool Category[] = { false, true, false, true, false, true };

std::set<std::string> st;       // distinct words seen in the corpus (sorted)
std::vector<std::string> v;     // vocabulary; v[i] is the word behind index i
int wordVec[MaxVocabNum];       // word counts of the most recently vectorised text

// After training: p?Num[j] is the smoothed P(word j | class ?), p?Vect[j] its
// natural logarithm. Class 0 = normal, class 1 = abusive.
float p0Num[MaxVocabNum], p1Num[MaxVocabNum], p0Vect[MaxVocabNum], p1Vect[MaxVocabNum];
// p?Denom is the smoothed word total of class ?; pAbusive is the prior P(class 1).
float p0Denom = 0.0, p1Denom = 0.0, pAbusive = 0.0;

// Splits `text` on single spaces, dropping empty tokens (runs of spaces count
// as one separator), without modifying the input.
static std::vector<std::string> splitOnSpaces(const std::string &text)
{
    std::vector<std::string> words;
    std::size_t pos = 0;
    while (pos < text.size()) {
        // find() yields npos when no space is left; clamp it to the end of the text.
        const std::size_t end = std::min(text.find(' ', pos), text.size());
        if (end > pos)
            words.push_back(text.substr(pos, end - pos));
        pos = end + 1;
    }
    return words;
}

// Fills wordVec with the number of occurrences of each vocabulary word in
// `text`. Words that are not in the vocabulary are ignored.
static void countWords(const std::string &text)
{
    std::fill(wordVec, wordVec + MaxVocabNum, 0);
    // Never index past the fixed-size array, whatever v happens to contain.
    const std::size_t limit = std::min<std::size_t>(v.size(), MaxVocabNum);
    const std::vector<std::string> words = splitOnSpaces(text);
    for (std::size_t w = 0; w < words.size(); ++w) {
        for (std::size_t i = 0; i < limit; ++i) {
            if (words[w] == v[i]) {
                ++wordVec[i];
                break;  // vocabulary entries are unique
            }
        }
    }
}

// Builds the vocabulary: collects every distinct word of postingList into `st`
// and copies them, in sorted order, into `v`. Any previous vocabulary is
// discarded, so the function can be called repeatedly.
// Returns the vocabulary size, which is at most MaxVocabNum.
int createVocabList()
{
    st.clear();
    v.clear();

    const std::size_t num = sizeof(postingList) / sizeof(*postingList);
    for (std::size_t i = 0; i < num; ++i) {
        const std::vector<std::string> words = splitOnSpaces(postingList[i]);
        st.insert(words.begin(), words.end());
    }

    for (std::set<std::string>::const_iterator it = st.begin(); it != st.end(); ++it) {
        if (v.size() == static_cast<std::size_t>(MaxVocabNum)) {
            // The per-word arrays cannot hold more; drop the rest instead of overrunning them.
            std::cerr << "warning: vocabulary truncated to " << MaxVocabNum << " words\n";
            break;
        }
        v.push_back(*it);
    }
    return static_cast<int>(v.size());
}

// Converts the space-separated text `str` into a word-count vector stored in
// the global wordVec. `str` is only read, never modified; a null pointer
// yields an all-zero vector. `vocabNum` is accepted for interface
// compatibility but not consulted: the current vocabulary `v` is used.
void setOfWord2Vec(char str[], const int vocabNum)
{
    (void)vocabNum;
    if (!str) {
        std::fill(wordVec, wordVec + MaxVocabNum, 0);
        return;
    }
    countWords(str);
}

// Trains the classifier on postingList/Category and prints the vocabulary,
// each post's word vector and the learned parameters to stdout.
//
// The word likelihoods use add-one (Laplace) smoothing: every vocabulary word
// starts with a count of 1 in both classes and each class total starts at the
// vocabulary size. No likelihood is therefore zero and the logarithms stored
// in p0Vect/p1Vect are always finite.
// Returns the vocabulary size.
int trainNaiveBayes()
{
    const int vocabNum = createVocabList();
    std::cout << "字典为:" << '\n';
    for (int i = 0; i != vocabNum; ++i)
        std::cout << v[i] << " ";
    std::cout << '\n';

    // Reset the accumulators so that retraining does not double-count.
    std::fill(p0Num, p0Num + MaxVocabNum, 0.0f);
    std::fill(p1Num, p1Num + MaxVocabNum, 0.0f);
    std::fill(p0Vect, p0Vect + MaxVocabNum, 0.0f);
    std::fill(p1Vect, p1Vect + MaxVocabNum, 0.0f);
    for (int j = 0; j < vocabNum; ++j) {
        p0Num[j] = 1.0f;
        p1Num[j] = 1.0f;
    }
    p0Denom = static_cast<float>(vocabNum);
    p1Denom = static_cast<float>(vocabNum);

    const int num = static_cast<int>(sizeof(postingList) / sizeof(*postingList));
    int abusiveDocs = 0;
    for (int doc = 0; doc < num; ++doc) {
        countWords(postingList[doc]);
        for (int j = 0; j != vocabNum; ++j)
            std::cout << wordVec[j] << " ";
        std::cout << '\n';

        // Pick the accumulators of the class this post belongs to.
        float *classCounts = Category[doc] ? p1Num : p0Num;
        float &classTotal = Category[doc] ? p1Denom : p0Denom;
        if (Category[doc])
            ++abusiveDocs;
        for (int j = 0; j < vocabNum; ++j) {
            classCounts[j] += wordVec[j];
            classTotal += wordVec[j];
        }
    }

    // Turn counts into probabilities and cache their logarithms for classify().
    for (int j = 0; j < vocabNum; ++j) {
        p0Num[j] /= p0Denom;
        p1Num[j] /= p1Denom;
        p0Vect[j] = std::log(p0Num[j]);
        p1Vect[j] = std::log(p1Num[j]);
    }

    std::cout << '\n' << "p0Denom " << p0Denom << '\n';
    for (int j = 0; j < vocabNum; ++j)
        std::cout << p0Num[j] << " ";
    std::cout << '\n' << '\n';
    std::cout << "p1Denom " << p1Denom << '\n';
    for (int j = 0; j < vocabNum; ++j)
        std::cout << p1Num[j] << " ";
    std::cout << '\n' << '\n';

    pAbusive = static_cast<float>(abusiveDocs) / num;
    std::cout << "pAbusive: " << pAbusive << '\n';
    return vocabNum;
}

// Classifies the space-separated text `str` with the trained model.
//
// Computes the log-posterior score of each class,
//   log P(class) + sum_j count_j * log P(word_j | class),
// prints both scores and returns true when the abusive class scores higher.
// Summing logarithms is the product of the word likelihoods without
// floating-point underflow. Words outside the vocabulary do not contribute.
// `vocabNum` is the vocabulary size returned by trainNaiveBayes().
bool classify(char *str, int vocabNum)
{
    setOfWord2Vec(str, vocabNum);

    const int limit = std::min(vocabNum, MaxVocabNum);
    float p0 = std::log(1.0f - pAbusive);
    float p1 = std::log(pAbusive);
    for (int i = 0; i < limit; ++i) {
        p0 += wordVec[i] * p0Vect[i];
        p1 += wordVec[i] * p1Vect[i];
    }
    std::cout << "p0: " << p0 << " " << "p1: " << p1 << '\n';
    return p1 > p0;
}

// Trains on the built-in corpus and classifies one sample sentence.
int main(void)
{
    int vocabNum = trainNaiveBayes();
    char str[] = "I love you";
    if (classify(str, vocabNum))
        std::cout << "是脏话" << '\n';
    else
        std::cout << "不是脏话" << '\n';
    return 0;
}