import pandas as pd
from pandas import DataFrame
# Load the data set into a DataFrame and echo it.
# The path is a raw string so the backslash is taken literally; in a normal
# string literal "\D" is an invalid escape sequence that newer Python
# versions warn about.
df_tennis = pd.read_csv(r"D:\DSet.csv")
print(df_tennis)
def entropy(probs):
    """Return the Shannon entropy, in bits, of a sequence of probabilities.

    Args:
        probs: Iterable of probabilities.

    Returns:
        The sum of ``-p * log2(p)`` over the non-zero entries of ``probs``
        (0 for an empty iterable).

    Raises:
        ValueError: If a probability is negative (raised by ``math.log``).
    """
    import math
    # A zero probability contributes nothing to the entropy (p*log p -> 0 as
    # p -> 0); skipping it also keeps math.log from raising on 0.
    return sum(-prob * math.log(prob, 2) for prob in probs if prob != 0)
def entropy_of_list(a_list):
    """Return the entropy of the value distribution found in ``a_list``.

    The relative frequency of every distinct item is used as its
    probability, and the resulting list is passed to ``entropy``.
    """
    from collections import Counter
    total = len(a_list)
    counts = Counter(item for item in a_list)
    frequencies = []
    for count in counts.values():
        frequencies.append(count / total)
    return entropy(frequencies)
# Entropy of the 'PT' column taken over the whole data set.
label_column = df_tennis['PT']
total_entropy = entropy_of_list(label_column)
print("\nTotal entropy of play tennis data set:", total_entropy)
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Return the entropy reduction obtained by splitting ``df`` on a column.

    Args:
        df: DataFrame containing both the split and the target column.
        split_attribute_name: Column whose values define the partitions.
        target_attribute_name: Column whose entropy is measured.
        trace: Unused by this function.

    Returns:
        Entropy of the target column minus the average of the per-partition
        target entropies, each weighted by the partition's share of rows.
    """
    print("Information Gain Calculation of ", split_attribute_name)
    print("target_attribute_name", target_attribute_name)

    partitions = df.groupby(split_attribute_name)
    for value, rows in partitions:
        print("Name: ", value)
        print("Group: ", rows)

    nobs = float(len(df.index))
    print("NOBS", nobs)

    # Per partition: entropy of the target and the partition's fraction of rows.
    aggregators = [entropy_of_list, lambda x: len(x)/nobs]
    summary = partitions.agg({target_attribute_name: aggregators})[target_attribute_name]
    print("df_agg_ent", summary)

    summary.columns = ['Entropy', 'Probs1']
    weighted_entropy = sum(summary['Entropy'] * summary['Probs1'])
    return entropy_of_list(df[target_attribute_name]) - weighted_entropy
# Show the information gain of splitting the data set on 'Outlook'.
outlook_gain = information_gain(df_tennis, 'Outlook', 'PT')
print('Info-gain for Outlook is :' + str(outlook_gain), "\n")
def id3(df, target_attribute_name, attribute_names, default_class=None):
    """Build an ID3 decision tree represented as nested dictionaries.

    Args:
        df: DataFrame of training examples.
        target_attribute_name: Column holding the class label.
        attribute_names: List of candidate columns to split on.
        default_class: Label returned when ``df`` has no rows.

    Returns:
        A class label for a leaf, otherwise a dict of the form
        ``{best_attribute: {attribute_value: subtree_or_label}}``.
    """
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])

    # Every example carries the same label: this node is a leaf.
    if len(cnt) == 1:
        return next(iter(cnt))
    # No examples at all: fall back to the label supplied by the caller.
    if df.empty:
        return default_class

    # Most frequent label at this node. (Taking max() over the labels
    # themselves would pick the alphabetically last one, not the majority.)
    majority_class = cnt.most_common(1)[0][0]

    # Nothing left to split on: answer with the majority of these examples.
    if not attribute_names:
        return majority_class

    print("attribute_names:", attribute_names)
    gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
    best_attr = attribute_names[gainz.index(max(gainz))]

    tree = {best_attr: {}}
    remaining_attribute_names = [i for i in attribute_names if i != best_attr]
    # One branch per observed value of the chosen attribute.
    for attr_val, data_subset in df.groupby(best_attr):
        tree[best_attr][attr_val] = id3(data_subset,
                                        target_attribute_name,
                                        remaining_attribute_names,
                                        majority_class)
    return tree
# Candidate split attributes: all columns of the data set except 'PT'.
attribute_names = df_tennis.columns.tolist()
print("List of Attributes:", attribute_names)
attribute_names.remove('PT')
print("Predicting Attributes:", attribute_names)

from pprint import pprint

# Build the tree from the complete data set and pretty-print it.
tree = id3(df_tennis, 'PT', attribute_names)
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)

# First key of the tree dict and the branch values stored under it.
attribute = next(iter(tree))
print("Best Attribute :\n", attribute)
print("Tree Keys:\n", tree[attribute].keys())
def classify(instance, tree, default=None):
    """Classify ``instance`` by walking a nested-dict decision tree.

    Args:
        instance: Mapping-like object indexable by attribute name.
        tree: Dict of the form ``{attribute: {value: subtree_or_label}}``.
        default: Value returned when the instance holds an attribute value
            for which the tree has no branch, at any depth.

    Returns:
        The label stored at the leaf that is reached, or ``default``.
    """
    attribute = next(iter(tree))
    print("Key:", tree.keys())
    print("Attribute:", attribute)
    print("Insance of Attribute :", instance[attribute], attribute)

    branches = tree[attribute]
    value = instance[attribute]
    if value not in branches:
        return default

    result = branches[value]
    print("Instance Attribute:", value, "TreeKeys :", branches.keys())
    if isinstance(result, dict):
        # Hand the default down so an unseen value deeper in the tree also
        # yields it instead of None.
        return classify(instance, result, default)
    return result
# Classify every row with the tree built from the full data set ('No' is
# passed as classify's default) and report the fraction of matching labels.
df_tennis['predicted'] = df_tennis.apply(classify, axis=1, args=(tree, 'No'))
print(df_tennis['predicted'])
print('\n Accuracy is:\n' + str(sum(df_tennis['PT'] == df_tennis['predicted']) / (1.0 * len(df_tennis.index))))

# Hold out the last four rows for testing and train on all the others.
# The training slice starts at row 0 so the first example is not dropped.
training_data = df_tennis.iloc[:-4]
# Copy the slice: a column is added to it below, and assigning to an
# un-copied slice of df_tennis triggers pandas' SettingWithCopyWarning.
test_data = df_tennis.iloc[-4:].copy()
train_tree = id3(training_data, 'PT', attribute_names)
test_data['predicted2'] = test_data.apply(classify, axis=1, args=(train_tree, 'Yes'))
print('\n\n Accuracy is : ' + str(sum(test_data['PT'] == test_data['predicted2']) / (1.0 * len(test_data.index))))
import pandas as pd
from pandas import DataFrame
# Read the data set from disk and print it. This rebinds df_tennis to a
# freshly loaded DataFrame.
# Raw string: the backslash in the Windows path must stay literal, and "\D"
# in an ordinary string literal is an invalid escape sequence.
df_tennis = pd.read_csv(r'D:\DSet.csv')
print(df_tennis)
def entropy(probs):
    """Compute the Shannon entropy (base 2) of the given probabilities.

    Args:
        probs: Iterable of probabilities.

    Returns:
        ``sum(-p * log2(p))`` over the non-zero probabilities; 0 when
        ``probs`` is empty.

    Raises:
        ValueError: If a probability is negative (raised by ``math.log``).
    """
    import math

    total = 0
    for prob in probs:
        # log(0) is undefined, and a zero-probability outcome adds no
        # entropy, so such terms are left out.
        if prob == 0:
            continue
        total += -prob * math.log(prob, 2)
    return total
def entropy_of_list(a_list):
    """Entropy of the empirical distribution of the items in ``a_list``.

    Counts how often each distinct item occurs, divides each count by the
    number of items, and passes those fractions to ``entropy``.
    """
    from collections import Counter

    size = len(a_list)
    tally = Counter(x for x in a_list)
    return entropy([occurrences / size for occurrences in tally.values()])
# Entropy of the 'PT' column across all rows of the data set.
pt_values = df_tennis['PT']
total_entropy = entropy_of_list(pt_values)
print("\nTotal entropy of play tennis data set:", total_entropy)
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Compute how much splitting on an attribute lowers the target entropy.

    Args:
        df: DataFrame with the split column and the target column.
        split_attribute_name: Column used to group the rows.
        target_attribute_name: Column whose entropy is evaluated.
        trace: Unused by this function.

    Returns:
        The target entropy of ``df`` minus the row-share-weighted sum of the
        target entropies of the groups.
    """
    print("Information Gain Calculation of ", split_attribute_name)
    print("target_attribute_name", target_attribute_name)

    grouped = df.groupby(split_attribute_name)
    for group_key, group_rows in grouped:
        print("Name: ", group_key)
        print("Group: ", group_rows)

    nobs = float(len(df.index))
    print("NOBS", nobs)

    # Two aggregates per group: the target entropy and the group's row share.
    spec = {target_attribute_name: [entropy_of_list, lambda x: len(x)/nobs]}
    per_group = grouped.agg(spec)[target_attribute_name]
    print("df_agg_ent", per_group)

    per_group.columns = ['Entropy', 'Probs1']
    entropy_after_split = sum(per_group['Entropy'] * per_group['Probs1'])
    entropy_before_split = entropy_of_list(df[target_attribute_name])
    return entropy_before_split - entropy_after_split
# Print the information gain obtained by splitting on 'Outlook'.
gain_for_outlook = information_gain(df_tennis, 'Outlook', 'PT')
print('Info-gain for Outlook is :' + str(gain_for_outlook), "\n")
def id3(df, target_attribute_name, attribute_names, default_class=None):
    """Recursively grow an ID3 decision tree as nested dictionaries.

    Args:
        df: DataFrame of training examples.
        target_attribute_name: Name of the class-label column.
        attribute_names: List of columns still available for splitting.
        default_class: Label to return if ``df`` contains no rows.

    Returns:
        A single class label when the node is a leaf, otherwise
        ``{best_attribute: {attribute_value: subtree_or_label}}``.
    """
    from collections import Counter

    cnt = Counter(x for x in df[target_attribute_name])

    if len(cnt) == 1:
        # Only one label present, so no further splitting is needed.
        return next(iter(cnt))

    if df.empty:
        return default_class

    # Majority label of the current examples. max() over the label keys would
    # compare the labels themselves and ignore their counts.
    majority_class = cnt.most_common(1)[0][0]

    if not attribute_names:
        # Mixed labels but no attributes left: predict the majority here.
        return majority_class

    print("attribute_names:", attribute_names)

    gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
    index_of_max = gainz.index(max(gainz))
    best_attr = attribute_names[index_of_max]

    tree = {best_attr: {}}
    remaining_attribute_names = [i for i in attribute_names if i != best_attr]

    # Grow one subtree for every value the best attribute takes in df.
    for attr_val, data_subset in df.groupby(best_attr):
        subtree = id3(data_subset,
                      target_attribute_name,
                      remaining_attribute_names,
                      majority_class)
        tree[best_attr][attr_val] = subtree

    return tree
# All columns except 'PT' are offered to id3 as split attributes.
attribute_names = [column for column in df_tennis.columns]
print("List of Attributes:", attribute_names)
attribute_names.remove('PT')
print("Predicting Attributes:", attribute_names)

from pprint import pprint

# Learn the tree from every row of the data set, then display it.
tree = id3(df_tennis, 'PT', attribute_names)
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)

# First key of the tree dict, followed by the branch values under it.
attribute = next(iter(tree))
print("Best Attribute :\n", attribute)
print("Tree Keys:\n", tree[attribute].keys())
def classify(instance, tree, default=None):
    """Predict a label for ``instance`` using a nested-dict decision tree.

    Args:
        instance: Object indexable by attribute name.
        tree: Dict shaped as ``{attribute: {value: subtree_or_label}}``.
        default: Returned whenever the tree, at any level, has no branch for
            the instance's value of the tested attribute.

    Returns:
        The leaf label reached by following the instance's attribute values,
        or ``default`` if the walk cannot continue.
    """
    attribute = next(iter(tree))

    print("Key:", tree.keys())
    print("Attribute:", attribute)
    print("Insance of Attribute :", instance[attribute], attribute)

    if instance[attribute] in tree[attribute].keys():
        result = tree[attribute][instance[attribute]]
        print("Instance Attribute:", instance[attribute], "TreeKeys :", tree[attribute].keys())

        if isinstance(result, dict):
            # Forward the default; without it a miss in a nested subtree
            # would return None rather than the caller's fallback.
            return classify(instance, result, default)
        return result

    return default
# Predict a label for each row using the tree grown on the full data set
# ('No' is classify's default) and print the share of correct predictions.
df_tennis['predicted'] = df_tennis.apply(classify, axis=1, args=(tree, 'No'))

print(df_tennis['predicted'])

print('\n Accuracy is:\n' + str(sum(df_tennis['PT'] == df_tennis['predicted']) / (1.0 * len(df_tennis.index))))

# Train on everything except the last four rows and test on those four.
# Slicing from the start keeps row 0 in the training set.
training_data = df_tennis.iloc[:-4]

# An explicit copy lets the prediction column be added without writing into
# a slice of df_tennis (which raises pandas' SettingWithCopyWarning).
test_data = df_tennis.iloc[-4:].copy()

train_tree = id3(training_data, 'PT', attribute_names)

test_data['predicted2'] = test_data.apply(classify, axis=1, args=(train_tree, 'Yes'))

print('\n\n Accuracy is : ' + str(sum(test_data['PT'] == test_data['predicted2']) / (1.0 * len(test_data.index))))