うまくいかないNB - KIWAM_KEN

本格的に勉強したいないぁと思い最近入門機械学習を少しづつ読み進めている．

入門機械学習

作者: Drew Conway,John Myles White,萩原正人,奥野陽,水野貴明,木下哲也
出版社/メーカー: オライリージャパン
発売日: 2012/12/22
メディア: 大型本
購入: 2人クリック: 41回
この商品を含むブログ (6件) を見る

コードはRで書かれていてGithubから一括DLできる。

とはいえR読めないし書けないマンなので，RubyとC++でおんなじようなことをしてみたのだが…

目的：メール本文からSpamどうか判定する

使用データ：付録のメールデータ

Rubyで各メールのBOWデータを作成

作成データ

>||

298 spool:1 msgchange:5 columns:1 packed:1 addition:1 handle:1 get:2 changed:1 other:3 lengthen:1 exmh:7 unusual:1 …

83 check:1 save:1 never:1 tue:1 intended:1 didnt:1 aug:2…

||<

先頭の数字が単一の単語数

訓練データは付録のcsvデータ（スパム、非スパムそれぞれで500件づつだったかな）

>||

"term","frequency","density","occurrence"

"abandoned",2,9.41132182014964e-05,0.002

"ability",2,9.41132182014964e-05,0.002

"abiword",4,0.000188226436402993,0.002

…

||<

このoccurrenceを使ってメールごとの条件付き確率を求める。

スパム、非スパムの事前確率は0.5，未知語はラプラススムージングはせずに定数(0.0001)で扱う。アンダーフローを抑えるために対数とって総和にする。

汚いコードはいずれ直るといいですね…

>|c|

#include <iostream>

#include <fstream>

#include <cstdlib>

#include <cmath>

#include <iomanip>

#include <vector>

#include <algorithm>

#include <map>

#include <sstream>

using namespace std;

std::map<string,double> Spam;

std::map<string,double> Eham;

const double C = 0.0001;

const double PriorS = 0.5;

const double PriorE = 0.5;

void readUID(char *fn1, std::map<string,double>& map){

ifstream fin;

int i,j,f;

double v,w;

char c;

string str,str2,gomi;

fin.open(fn1);

if(!fin){

cerr << "ERROR: Failed to open file" << endl;

exit(1);

}

getline(fin,gomi);

while(1){

getline(fin,gomi,'"');

getline(fin,str,'"');

getline(fin,gomi,',');

fin >> f >> c >> v >> c >> w;

//cout << str << " " << f << " " << v << " " << w << endl;

if(fin.eof()==1) break;

map.insert(pair<string,double>(str,w));

}

cout << endl;

fin.close();

}

double calNB(map<string,double> D, map<string,int> M, const double P){

int i;

double v=log(P);

//cout << v << endl;

map<string,int>::iterator it = M.begin();

while(it != M.end()){

if(it->second >= 2){

//cout << it->first << " " << D[it->first] << endl;

if(D.count(it->first) == 0){

//xv += log(C);

}else{

//cout << it->first << " " << D[it->first] << endl;

v += log(D[it->first]);

}

//cout << v << endl;

}

it++;

}

return v;

}

void readFile(char *fn1)

{

ifstream fin;

int i,j,Dm=0,Dc=0,size,freq;

double s,e,v;

char c;

string str,str2,gomi;

std::stringstream ss;

map<string,int> MapA;

fin.open(fn1);

if(!fin){

cerr << "ERROR: Failed to open file" << endl;

exit(1);

}

while(1){

getline(fin,gomi,' ');

if(fin.eof()==1) break;

ss << gomi;

ss >> size;

ss.str("");

ss.clear(stringstream::goodbit);

for(j=0;j<size;j++){

getline(fin,str,':');

getline(fin,str2,' ');

ss << str2;

ss >> freq;

//cout << j+1 << " " << str << " " << str2 << " " << freq << endl;

MapA.insert(pair<string,int>(str,freq));

ss.str(""); //

ss.clear(stringstream::goodbit);

}

s = calNB(Spam,MapA,PriorS);

e = calNB(Eham,MapA,PriorE);

if(s >= e){

printf("%e %e SPAM\n",s,e);

}else{

printf("%e %e HAM\n",s,e);

Dc++;

}

MapA.clear();

Dm++;

}

printf("分類精度=%f\n",(double)Dc/(double)Dm);

fin.close();

}

int main(int argc, char **argv){

readUID(argv[1],Spam);//read UID1

cout << "read OK" << endl;

readUID(argv[2],Eham);//read UID2

cout << "read OK" << endl;

readFile(argv[3]); //readRT.lbl

cout << "print OK\n";

return 0;

}

<||

結果

||>

サンプル

SPAM　　　　　　HAM　　　　判定

--------------------------------

-1.256445e+02 -1.793793e+02 SPAM

-3.919911e+01 -6.407220e+01 SPAM

-3.890017e+01 -6.789437e+01 SPAM

-1.519796e+02 -2.220868e+02 SPAM

-1.756950e+01 -2.413611e+01 SPAM

-1.554876e+02 -2.526647e+02 SPAM

-2.959326e+01 -7.078474e+01 SPAM

-4.443161e+01 -1.037077e+02 SPAM

-4.753398e+01 -8.619887e+01 SPAM

…

…これはおかしい

分類精度が20％とかそれくらいになってしまって参考書とも全然違う（むしろ逆になればちょうどいい？）

おかしい可能性があるところ

・資料と文字列処理が異なるのでそれが原因になっている

・Cコードが間違えている

再先不安ですね…