Decision Tree Program for Classification
Program : -
#include<bits/stdc++.h>
using namespace std;
//STRUCTURE OF A NODE IN DECISION TREE
struct TreeNode
{
string splitAttr;
string label;
bool isleaf;
vector<vector<string> >Table;
vector<TreeNode*>children;
};
//Print a Table
void printTable(vector<vector<string> >&Table)
{
for(int i=0;i<Table.size();i++)
{
char arr[30];
for(int j=0;j<Table[i].size();j++)
printf("%-15s ",strcpy(arr,Table[i][j].c_str()));
cout<<endl;
}
}
//Splitting Function
TreeNode *createTree(TreeNode *node,vector<vector<string> >&input)
{
node->isleaf=false;
node->Table=input;
int rows=input.size(),col=input[0].size(),cnt=0;
map<string,int>mp;
for(int i=1;i<rows;i++) mp[input[i][col-1]]++,cnt++;
//Base case (When all class is same.)
if(cnt==mp[input[1][col-1]])
{
TreeNode *temp=new TreeNode();
temp->isleaf=true;
temp->label=input[1][col-1];
temp->splitAttr="LeafNode";
node->splitAttr="Final";
node->children.push_back(temp);
return node;
}
//Distribution Information Gain.
double infoGain=0.0,tot=cnt,temp;
for(map<string,int>::iterator it=mp.begin();it!=mp.end();it++)
temp=it->second,infoGain+=-temp/tot*log2(temp/tot);
//Calculation of Information & gain of each attribute.
double mx=-1.0; int cl_num=-1;
for(int i=0;i<col-1;i++)
{
map<string,vector<string> >mps;
for(int j=1;j<rows;j++) mps[input[j][i]].push_back(input[j][col-1]);
double gain=0.0;
for(map<string,vector<string> >::iterator it=mps.begin();it!=mps.end();it++)
{
map<string,int>mpss; cnt=0;
for(int k=0;k<it->second.size();k++) mpss[it->second[k]]++,cnt++;
map<string,int>::iterator its=mpss.begin();
if(its->second==cnt) continue;
double infoAttr=0.0; tot=cnt;
for( ;its!=mpss.end();its++)
temp=its->second,infoAttr+=-temp/tot*log2(temp/tot);
gain+=((double)it->second.size()/((double)(rows-1)))*infoAttr;
}
gain=infoGain-gain;
if(gain>mx) mx=gain,cl_num=i;
//cout<<input[0][i]<<" "<<gain<<endl;
}
//Splitting attribute. (Decided on the basis of maximum gain value.
node->splitAttr=input[0][cl_num];
set<string>st;
for(int i=1;i<rows;i++)
st.insert(input[i][cl_num]);
for(set<string>::iterator it=st.begin();it!=st.end();it++)
{
TreeNode *tmp=new TreeNode();
tmp->label=*it;
vector<vector<string> >nyatable;
for(int i=0;i<rows;i++)
{
vector<string>tts;
if(i==0 || input[i][cl_num]==*it)
{
for(int j=0;j<col;j++)
if(j!=cl_num) tts.push_back(input[i][j]);
nyatable.push_back(tts);
}
}
cout<<"===================================================================\n";
cout<<"--------------Splitting attribute : "<<node->splitAttr<<"------------\n";
cout<<"--------------Label : "<<tmp->label<<"------------\n";
printTable(nyatable);
tmp=createTree(tmp,nyatable);
node->children.push_back(tmp);
}
return node;
}
//Predicting the decision.
string prediction(TreeNode *node,map<string,string>mp)
{
if(node->isleaf) return node->label;
string label=mp[node->splitAttr];
if(node->splitAttr.compare("Final")==0) return node->children[0]->label;
for(int i=0;i<node->children.size();i++)
{
if(label.compare(node->children[i]->label)==0)
return prediction(node->children[i],mp);
}
return "Can't be predicted\n";
}
int main()
{
//Training File Input to the InputTable.
char buff[1000];
FILE *fp=fopen("Train.txt","r");
if(!fp) { cout<<"No training data found.\n"; return 0;}
if(fgets(buff,1000,fp)==NULL) { cout<<"Training file is empty.\n"; return 0;}
vector<vector<string> >InputTable;
vector<string>temp; string tt="";
for(int i=0;i<strlen(buff);i++)
{
if(buff[i]==' '||buff[i]=='\t'||buff[i]=='\n')
{
temp.push_back(tt); tt="";
while(buff[i]==' '||buff[i]=='\t'||buff[i]=='\n') i++; i--;
}
else tt+=buff[i];
}
temp.push_back(tt);
InputTable.push_back(temp);
temp.resize(0); tt="";
while(fgets(buff,1000,fp)!=NULL)
{
for(int i=0;i<strlen(buff);i++)
{
if(buff[i]==' '||buff[i]=='\t'||buff[i]=='\n')
{
temp.push_back(tt); tt="";
while(buff[i]==' '||buff[i]=='\t'||buff[i]=='\n') i++; i--;
}
else tt+=buff[i];
}
temp.push_back(tt);
InputTable.push_back(temp);
temp.resize(0); tt="";
}
for(int i=0;i<InputTable.size()-1;i++) InputTable[i].pop_back();
cout<<"-----------Input Matrix------------\n";
//Creating Decision tree.
TreeNode *root=new TreeNode();
root->label="Input_Table";
cout<<"------------Label = "<<root->label<<"---------------\n";
printTable(InputTable);
root=createTree(root,InputTable);
//Prediction.
int samp; cout<<"Enter number of prediction : "; cin>>samp;
while(samp--)
{
map<string,string>mp;
string inp;
for(int i=0;i<InputTable[0].size()-1;i++)
cout<<"Enter "<<InputTable[0][i]<<" : ",cin>>inp,mp[InputTable[0][i]]=inp;
cout<<"Predicted value : "<<prediction(root,mp)<<endl;
cout<<"=============================================================\n";
}
fclose(fp);
return 0;
}
Training File :-
1.First row should be attribute name with space separated and it should be followed by n rows (tuples). Each tuple must have all the attribute value.
2.This program can't accept empty value of any attribute.
3.Class variable should be the last column of the training set file.
Input correct value is necessary for prediction :-