Pages

Tuesday, 19 September 2017

Decision Tree Program

Decision Tree Program for Classification

Program : -

#include<bits/stdc++.h>

using namespace std;

//STRUCTURE OF A NODE IN DECISION TREE

struct TreeNode

{

    string splitAttr;

    string label;

    bool isleaf;

    vector<vector<string> >Table;

    vector<TreeNode*>children;

};

//Print a Table

void printTable(vector<vector<string> >&Table)

{

    for(int i=0;i<Table.size();i++)

    {

        char arr[30];

        for(int j=0;j<Table[i].size();j++)

            printf("%-15s ",strcpy(arr,Table[i][j].c_str()));

        cout<<endl;

    }

}

//Splitting Function

TreeNode *createTree(TreeNode *node,vector<vector<string> >&input)

{

    node->isleaf=false;

    node->Table=input;

    int rows=input.size(),col=input[0].size(),cnt=0;

    map<string,int>mp;

    for(int i=1;i<rows;i++) mp[input[i][col-1]]++,cnt++;

    //Base case (When all class is same.)

    if(cnt==mp[input[1][col-1]])

    {

        TreeNode *temp=new TreeNode();

        temp->isleaf=true;

        temp->label=input[1][col-1];

        temp->splitAttr="LeafNode";

        node->splitAttr="Final";

        node->children.push_back(temp);

        return node;

    }

    //Distribution Information Gain.

    double infoGain=0.0,tot=cnt,temp;

    for(map<string,int>::iterator it=mp.begin();it!=mp.end();it++)

        temp=it->second,infoGain+=-temp/tot*log2(temp/tot);

    //Calculation of Information & gain of each attribute.

    double mx=-1.0; int cl_num=-1;

    for(int i=0;i<col-1;i++)

    {

        map<string,vector<string> >mps;

        for(int j=1;j<rows;j++) mps[input[j][i]].push_back(input[j][col-1]);

        double gain=0.0;

        for(map<string,vector<string> >::iterator it=mps.begin();it!=mps.end();it++)

        {

            map<string,int>mpss; cnt=0;

            for(int k=0;k<it->second.size();k++) mpss[it->second[k]]++,cnt++;

            map<string,int>::iterator its=mpss.begin();

            if(its->second==cnt) continue;

            double infoAttr=0.0; tot=cnt;

            for( ;its!=mpss.end();its++)

                temp=its->second,infoAttr+=-temp/tot*log2(temp/tot);

            gain+=((double)it->second.size()/((double)(rows-1)))*infoAttr;

        }

        gain=infoGain-gain;

        if(gain>mx) mx=gain,cl_num=i;

        //cout<<input[0][i]<<" "<<gain<<endl;

    }

    //Splitting attribute. (Decided on the basis of maximum gain value.

    node->splitAttr=input[0][cl_num];

    set<string>st;

    for(int i=1;i<rows;i++)

        st.insert(input[i][cl_num]);

    for(set<string>::iterator it=st.begin();it!=st.end();it++)

    {

        TreeNode *tmp=new TreeNode();

        tmp->label=*it;

        vector<vector<string> >nyatable;

        for(int i=0;i<rows;i++)

        {

            vector<string>tts;

            if(i==0 || input[i][cl_num]==*it)

            {

                for(int j=0;j<col;j++)

                    if(j!=cl_num) tts.push_back(input[i][j]);

                nyatable.push_back(tts);

            }

        }

        cout<<"===================================================================\n";

        cout<<"--------------Splitting attribute : "<<node->splitAttr<<"------------\n";

        cout<<"--------------Label : "<<tmp->label<<"------------\n";

        printTable(nyatable);

        tmp=createTree(tmp,nyatable);

        node->children.push_back(tmp);

    }

    return node;

}

//Predicting the decision.

string prediction(TreeNode *node,map<string,string>mp)

{

    if(node->isleaf) return node->label;

    string label=mp[node->splitAttr];

    if(node->splitAttr.compare("Final")==0) return node->children[0]->label;

    for(int i=0;i<node->children.size();i++)

    {

        if(label.compare(node->children[i]->label)==0)

            return prediction(node->children[i],mp);

    }

    return "Can't be predicted\n";

}

int main()

{

    //Training File Input to the InputTable.

    char buff[1000];

    FILE *fp=fopen("Train.txt","r");

    if(!fp) { cout<<"No training data found.\n"; return 0;}

    if(fgets(buff,1000,fp)==NULL) { cout<<"Training file is empty.\n"; return 0;}

    vector<vector<string> >InputTable;

    vector<string>temp; string tt="";

    for(int i=0;i<strlen(buff);i++)

    {

        if(buff[i]==' '||buff[i]=='\t'||buff[i]=='\n')

        {

            temp.push_back(tt); tt="";

            while(buff[i]==' '||buff[i]=='\t'||buff[i]=='\n') i++; i--;

        }

        else    tt+=buff[i];

    }

    temp.push_back(tt);

    InputTable.push_back(temp);

    temp.resize(0); tt="";

    while(fgets(buff,1000,fp)!=NULL)

    {

        for(int i=0;i<strlen(buff);i++)

        {

            if(buff[i]==' '||buff[i]=='\t'||buff[i]=='\n')

            {

                temp.push_back(tt); tt="";

                while(buff[i]==' '||buff[i]=='\t'||buff[i]=='\n') i++; i--;

            }

            else    tt+=buff[i];

        }

        temp.push_back(tt);

        InputTable.push_back(temp);

        temp.resize(0); tt="";

    }

    for(int i=0;i<InputTable.size()-1;i++) InputTable[i].pop_back();

    cout<<"-----------Input Matrix------------\n";

    //Creating Decision tree.

    TreeNode *root=new TreeNode();

    root->label="Input_Table";

    cout<<"------------Label = "<<root->label<<"---------------\n";

    printTable(InputTable);

    root=createTree(root,InputTable);

    //Prediction.

    int samp; cout<<"Enter number of prediction : "; cin>>samp;

    while(samp--)

    {

        map<string,string>mp;

        string inp;

        for(int i=0;i<InputTable[0].size()-1;i++)

            cout<<"Enter "<<InputTable[0][i]<<" : ",cin>>inp,mp[InputTable[0][i]]=inp;

        cout<<"Predicted value : "<<prediction(root,mp)<<endl;

        cout<<"=============================================================\n";

    }

    fclose(fp);

    return 0;

}


Training File :- 

1.First row should be attribute name with space separated and it should be followed by n rows (tuples). Each tuple must have all the attribute value. 
2.This program can't accept empty value of any attribute.
3.Class variable should be the last column of the training set file.

Input correct value is necessary for prediction :-