Joseph Kliegman / Feb 27 2019

Equipment Success Prediction

from __future__ import print_function
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D 
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
import keras  # standalone Keras; the model-building cells below import from it
import sklearn
from tensorflow.keras import layers

from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
keras.__version__
sklearn.__version__
! ls
equipment_success_unique.csv

equipment_success_dataframe = pd.read_csv("equipment_success_unique.csv", sep=",")

# Age at fitting, bucketed into decades and whole years.
equipment_success_dataframe['decades_old'] = round(equipment_success_dataframe['age_in_months_at_equipped_start'] / 120)
equipment_success_dataframe['years'] = round(equipment_success_dataframe['age_in_months_at_equipped_start'] / 12)

# Binary indicator helpers for the categorical columns.
def isMale(x):
    return 1 if x == "male" else 0

def isFemale(x):
    return 1 if x == "female" else 0

def isRightEar(x):
    return 1 if x == "R" else 0

def isLeftEar(x):
    return 1 if x == "L" else 0

def isBothEars(x):
    return 1 if x == "BIN" else 0
# Convert a decibel value to a linear amplitude ratio.
def dbToAmp(x):
    return 10**(x/20)
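
# Sanity check, assuming thresholds are plain dB values: every 20 dB step
# multiplies linear amplitude by 10.
assert dbToAmp(20) == 10.0
assert dbToAmp(40) == 100.0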

# Integer encoding of gender: male=1, female=2, unknown=0.
def genderToNum(x):
    if x == "male":
        return 1
    if x == "female":
        return 2
    return 0
            

    

equipment_success_dataframe['genderNum'] = equipment_success_dataframe['gender'].apply(genderToNum)

equipment_success_dataframe['isMale'] = equipment_success_dataframe['gender'].apply(isMale)
equipment_success_dataframe['isFemale'] = equipment_success_dataframe['gender'].apply(isFemale)
equipment_success_dataframe['isRightEar'] = equipment_success_dataframe['ears'].apply(isRightEar)
equipment_success_dataframe['isLeftEar'] = equipment_success_dataframe['ears'].apply(isLeftEar)
equipment_success_dataframe['isBothEars'] = equipment_success_dataframe['ears'].apply(isBothEars)
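
A minimal equivalent sketch with pandas built-ins, assuming 'gender' and 'ears' hold the raw category strings: pd.get_dummies yields the same indicator columns in one call.

gender_ears_dummies = pd.get_dummies(
    equipment_success_dataframe[['gender', 'ears']],
    prefix=['gender', 'ears'])  # e.g. gender_male, ears_R, ears_BIN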

'''
equipment_success_dataframe['freq_500'] = dbToAmp(equipment_success_dataframe['freq_500'])
equipment_success_dataframe['freq_1000'] = dbToAmp(equipment_success_dataframe['freq_1000'])
equipment_success_dataframe['freq_2000'] = dbToAmp(equipment_success_dataframe['freq_2000'])
equipment_success_dataframe['freq_4000'] = dbToAmp(equipment_success_dataframe['freq_4000'])
equipment_success_dataframe['eq_freq_500'] = dbToAmp(equipment_success_dataframe['eq_freq_500'])
equipment_success_dataframe['eq_freq_1000'] = dbToAmp(equipment_success_dataframe['eq_freq_1000'])
equipment_success_dataframe['eq_freq_2000'] = dbToAmp(equipment_success_dataframe['eq_freq_2000'])
equipment_success_dataframe['eq_freq_4000'] = dbToAmp(equipment_success_dataframe['eq_freq_4000'])
'''

# Threshold ratios across frequencies (relative audiogram shape).
equipment_success_dataframe['freq_500_over_freq_1000'] = equipment_success_dataframe['freq_500']/equipment_success_dataframe['freq_1000']
equipment_success_dataframe['freq_2000_over_freq_1000'] = equipment_success_dataframe['freq_2000']/equipment_success_dataframe['freq_1000']
equipment_success_dataframe['freq_4000_over_freq_1000'] = equipment_success_dataframe['freq_4000']/equipment_success_dataframe['freq_1000']


# Absolute improvement (unaided minus aided threshold) per frequency.
equipment_success_dataframe['freq_500_gain'] = equipment_success_dataframe['freq_500'] - equipment_success_dataframe['eq_freq_500']
equipment_success_dataframe['freq_1000_gain'] = equipment_success_dataframe['freq_1000'] - equipment_success_dataframe['eq_freq_1000']
equipment_success_dataframe['freq_2000_gain'] = equipment_success_dataframe['freq_2000'] - equipment_success_dataframe['eq_freq_2000']
equipment_success_dataframe['freq_4000_gain'] = equipment_success_dataframe['freq_4000'] - equipment_success_dataframe['eq_freq_4000']

# Improvement relative to the unaided threshold.
equipment_success_dataframe['freq_500_gain_ratio'] = equipment_success_dataframe['freq_500_gain'] / equipment_success_dataframe['freq_500']
equipment_success_dataframe['freq_1000_gain_ratio'] = equipment_success_dataframe['freq_1000_gain'] / equipment_success_dataframe['freq_1000']
equipment_success_dataframe['freq_2000_gain_ratio'] = equipment_success_dataframe['freq_2000_gain'] / equipment_success_dataframe['freq_2000']
equipment_success_dataframe['freq_4000_gain_ratio'] = equipment_success_dataframe['freq_4000_gain'] / equipment_success_dataframe['freq_4000']

# equipment_success_dataframe['eq_gain_ratio'] = (-equipment_success_dataframe['eq_average_loss'] + equipment_success_dataframe['average_loss'])/equipment_success_dataframe['average_loss']


equipment_success_dataframe = equipment_success_dataframe.drop(columns=["eq_average_loss", "eq_high_loss", "eq_low_loss", "rk"])

equipment_success_dataframe
equipment_success_dataframe['valid'] = (
    # equipment_success_dataframe['freq_250'].apply(lambda x: not math.isnan(x)) &
    equipment_success_dataframe['freq_500'].apply(lambda x: not math.isnan(x)) &
    # equipment_success_dataframe['freq_750'].apply(lambda x: not math.isnan(x)) &
    equipment_success_dataframe['freq_1000'].apply(lambda x: not math.isnan(x)) &
    # equipment_success_dataframe['freq_1500'].apply(lambda x: not math.isnan(x)) &
    equipment_success_dataframe['freq_2000'].apply(lambda x: not math.isnan(x)) &
    # equipment_success_dataframe['freq_3000'].apply(lambda x: not math.isnan(x)) &
    equipment_success_dataframe['freq_4000'].apply(lambda x: not math.isnan(x)) &
    # equipment_success_dataframe['freq_6000'].apply(lambda x: not math.isnan(x)) &
    equipment_success_dataframe['freq_8000'].apply(lambda x: not math.isnan(x)) &
    # equipment_success_dataframe['eq_freq_500'].apply(lambda x: not math.isnan(x)) &
    equipment_success_dataframe['eq_freq_1000'].apply(lambda x: not math.isnan(x)) &
    # equipment_success_dataframe['eq_freq_2000'].apply(lambda x: not math.isnan(x)) &
    # equipment_success_dataframe['eq_freq_4000'].apply(lambda x: not math.isnan(x)) &

    # equipment_success_dataframe['freq_500_gain'].apply(lambda x: not math.isnan(x) and x > 0) &
    equipment_success_dataframe['freq_1000_gain'].apply(lambda x: not math.isnan(x) and x > 0) &
    # equipment_success_dataframe['freq_2000_gain'].apply(lambda x: not math.isnan(x) and x > 0) &
    # equipment_success_dataframe['freq_4000_gain'].apply(lambda x: not math.isnan(x) and x > 0) &

    # equipment_success_dataframe['freq_500_gain_ratio'].apply(lambda x: not math.isnan(x) and x < 1) &
    equipment_success_dataframe['freq_1000_gain_ratio'].apply(lambda x: not math.isnan(x) and x < 1) &
    # equipment_success_dataframe['freq_2000_gain_ratio'].apply(lambda x: not math.isnan(x) and x < 1) &
    # equipment_success_dataframe['freq_4000_gain_ratio'].apply(lambda x: not math.isnan(x) and x < 1) &

    equipment_success_dataframe['isBothEars'].apply(lambda x: x == 0) &

    equipment_success_dataframe['average_loss'].apply(lambda x: not math.isnan(x)) &
    # equipment_success_dataframe['high_loss'].apply(lambda x: not math.isnan(x)) &
    # equipment_success_dataframe['low_loss'].apply(lambda x: not math.isnan(x)) &
    equipment_success_dataframe['months_since_equipped'].apply(lambda x: x > 12) &
    equipment_success_dataframe['decades_old'].apply(lambda x: not math.isnan(x) and 4 < x < 11)
)
equipment_success_dataframe.query('valid == True')
# equipment_success = equipment_success_dataframe.query('center_id == 29 and valid == True').copy()
equipment_success_valid = equipment_success_dataframe.query('valid == True').copy()

equipment_success_valid
for feature in ['eq_freq_1000','freq_1000_gain_ratio', 'freq_2000_gain_ratio']:
    display.display(equipment_success_valid.hist(feature))
def scaleInner(df, feature):
    # z-score one column in place (note: pandas describe() reports the
    # sample std, ddof=1).
    stats = df[feature].describe()
    std = stats['std']
    mean = stats['mean']
    df[feature + '_scaled'] = (df[feature] - mean)/std
    return df

def scale(df, features):
    scaledDf = df.copy()
    for feature in features:
        scaleInner(scaledDf, feature)
    return scaledDf
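
An equivalent sketch using scikit-learn, assuming the same feature lists: StandardScaler performs the same z-scoring, though it uses the population std (ddof=0) where pandas' describe() reports the sample std (ddof=1).

from sklearn.preprocessing import StandardScaler

def scale_sklearn(df, features):
    scaled = df.copy()
    values = StandardScaler().fit_transform(df[features])
    for i, feature in enumerate(features):
        scaled[feature + '_scaled'] = values[:, i]
    return scaled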
def success_ratio(equipment_success, freq, multiplier, threshold):
    # Share of rows whose gain at `freq` exceeds multiplier*loss - threshold.
    data = pd.value_counts(
        equipment_success['freq_' + freq + '_gain'] > (equipment_success['freq_' + freq] * multiplier) - threshold
    )
    return data[True]/(data[False] + data[True])
[success_ratio(equipment_success_valid, '500', 0.5, 5),
success_ratio(equipment_success_valid, '1000', 0.5, 5),
success_ratio(equipment_success_valid, '2000', 0.5, 5),
success_ratio(equipment_success_valid, '4000', 0.5, 5)]
def model_good_prediction_ratio_array(targets, predictions, max_distance):
    # Fraction of predictions within max_distance of the target (numpy inputs).
    predictionsDiff = list(map(lambda x: abs(x), list(targets - predictions)))
    return len(list(filter(lambda x: x < max_distance, predictionsDiff)))/len(predictionsDiff)

def model_good_prediction_ratio(targets, predictions, max_distance):
    # Same ratio for pandas Series inputs.
    predictionsDiff = (targets - predictions).apply(lambda x: abs(x))
    return len(list(filter(lambda x: x < max_distance, predictionsDiff)))/len(predictionsDiff)
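
The same metric vectorizes to one line; a sketch assuming numpy-compatible inputs:

def model_good_prediction_ratio_np(targets, predictions, max_distance):
    # Mean of the boolean mask == fraction of predictions within max_distance.
    return float(np.mean(np.abs(np.asarray(targets) - np.asarray(predictions)) < max_distance))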
equipment_success = equipment_success_valid.copy() #[(equipment_success_valid['isMale'] == 0)
                                           # & (equipment_success_valid['decades_old'] == 7)
                                           #].copy()

equipment_success
def safe_append(arr, x):
    # Append to a possibly-missing list (None -> new single-element list).
    if arr is None:
        return [x]
    arr.append(x)
    return arr

def safe_inc(n):
    # Increment a possibly-missing counter (None -> 1).
    if n is None:
        return 1
    return n + 1
center_ids = equipment_success['center_id'].values
tests_per_center = {}
for i in range(len(center_ids)):
    center_id = center_ids[i]
    tests_per_center[center_id] = safe_inc(tests_per_center.get(center_id))
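
The counting loop collapses to the standard library; a sketch over the same center_ids array:

from collections import Counter
tests_per_center = dict(Counter(center_ids))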

1. Keras

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input, advanced_activations
from keras import optimizers
from keras import regularizers
keras.__version__
equipment_success_valid = scale(equipment_success_valid, ['freq_250', 'freq_500', 'freq_750', 'freq_1000', 
                                                          'freq_1500', 'freq_2000', 'freq_3000', 'freq_4000',
                                                          'freq_6000', 'freq_8000', 'years', 'decades_old', 'age_in_months_at_equipped_start'])


equipment_success_all = equipment_success_valid.copy()
#[(equipment_success_valid['isFemale'] == 1)
#                                            & (equipment_success_valid['decades_old'] == 8)
#                                           ].copy()

equipment_success = equipment_success_all.sample(frac=1)
 
equipment_success
features = [
    'years_scaled',
    'isMale',
    'isFemale',
    # 'isLeftEar',
    # 'isRightEar',
    # 'freq_250_scaled',
    'freq_500_scaled',
    # 'freq_750_scaled',
    'freq_1000_scaled',
    # 'freq_1500_scaled',
    'freq_2000_scaled',
    # 'freq_3000_scaled',
    'freq_4000_scaled',
    # 'freq_6000_scaled',
    'freq_8000_scaled',
    # 'freq_500_over_freq_1000',
    # 'freq_2000_over_freq_1000',
    # 'freq_4000_over_freq_1000'
]

data = (equipment_success[features].values)
labels = (equipment_success['eq_freq_1000'].values)

print("original good prediction ratio: ", model_good_prediction_ratio_array(labels, np.average(labels), 5))
print("original rmse: " , np.std(labels - np.average(labels)))

2. Data Exploration

original_data = data.copy()
original_labels = labels.copy()
unscaled_features = [
    'age_in_months_at_equipped_start',
    'isMale',
    'isFemale',
    'freq_250',
    'freq_500',
    'freq_750',
    'freq_1000',
    'freq_1500',
    'freq_2000',
    'freq_3000',
    'freq_4000',
    'freq_6000',
    'freq_8000',
]
ff = equipment_success[unscaled_features].values
labels = equipment_success['eq_freq_1000'].values
[ff.shape, labels.shape]
len(ff)
bad = {}
for i in range(len(ff)):
    if i % 100 == 0:
        # Progress report for the O(n^2) scan.
        print(i)
        print(bad)
    for j in range(i + 1, len(ff)):
        # Flag "contradictory" pairs: same gender indicators, ages within ten
        # years, audiograms within 10 dB (Euclidean), yet labels 5+ dB apart.
        if ff[i][1] == ff[j][1] and ff[i][2] == ff[j][2] and abs(ff[i][0] - ff[j][0]) < 120 and np.linalg.norm(ff[i][3:] - ff[j][3:]) < 10 and abs(labels[i] - labels[j]) >= 5:
        #if ff[i][1] == ff[j][1] and ff[i][2] == ff[j][2] and abs(ff[i][0] - ff[j][0]) < 120 and abs(ff[i][6] - ff[j][6]) <= 5 and abs(labels[i] - labels[j]) > 5:
        #if abs(ff[i][6] - ff[j][6]) <= 5 and abs(labels[i] - labels[j]) > 5:
            if bad.get(i) is None:
                bad[i] = []
            bad[i].append(j)
bad
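
The O(n^2) scan above gets slow as n grows. A sketch of a radius-query alternative over the same ff/labels arrays and thresholds, using scikit-learn's NearestNeighbors on the audiogram columns (note: radius_neighbors uses <= radius where the loop used <):

from sklearn.neighbors import NearestNeighbors

def find_contradictions(ff, labels, radius=10, label_gap=5):
    nn = NearestNeighbors(radius=radius).fit(ff[:, 3:])
    neighbour_lists = nn.radius_neighbors(ff[:, 3:], return_distance=False)
    conflicts = {}
    for i, neighbours in enumerate(neighbour_lists):
        for j in neighbours:
            if j <= i:
                continue  # visit each unordered pair once, as above
            if (ff[i][1] == ff[j][1] and ff[i][2] == ff[j][2]
                    and abs(ff[i][0] - ff[j][0]) < 120
                    and abs(labels[i] - labels[j]) >= label_gap):
                conflicts.setdefault(i, []).append(j)
    return conflicts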
[ff[0], labels[0]]
[ff[26], labels[26]]
list(bad.keys())
good_data = np.delete(data, list(bad.keys()), axis=0)
good_labels = np.delete(labels, list(bad.keys()), axis=0)
[good_data.shape, good_labels.shape]

3. SVM

from sklearn import svm
my_features = [
    'center_id',
    'years',
    'genderNum',
    # 'freq_250',
    'freq_500',
    # 'freq_750',
    'freq_1000',
    # 'freq_1500',
    'freq_2000',
    # 'freq_3000',
    'freq_4000',
    # 'freq_6000',
    'freq_8000',
]
my_equipment_success = equipment_success.copy()
my_equipment_success['valid'] = (
  my_equipment_success['eq_freq_1000'].apply(lambda x : 20 <= x <= 60) 
)

#my_equipment_success = my_equipment_success.query(' valid == True').copy()
my_equipment_success = my_equipment_success.sample(frac=1)
data = my_equipment_success[my_features].values
labels = (my_equipment_success['eq_freq_1000'].values)
[data.shape, labels.shape]
n = 10000
clf = svm.SVR(gamma='scale', kernel='rbf', C=4, epsilon=4, shrinking=True)
clf.fit(data[0:n], labels[0:n])
predictions = clf.predict(data)
print("training")

print(np.std(labels[0:n]-np.mean(labels)))
print(np.std(labels[0:n]-predictions[0:n]))
print(np.std(labels[0:n]-predictions[0:n]) ** 2)
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], 5))
print("validation")
print(np.std(labels[n+1:]-np.mean(labels)))
print(np.std(labels[n+1:]-predictions[n+1:]))
print(np.std(labels[n+1:]-predictions[n+1:]) ** 2)
print(model_good_prediction_ratio_array(labels[n+1:], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[n+1:], predictions[n+1:], 5))
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVR
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=0)
tuned_parameters = [{'kernel': ['rbf'], 
                     'gamma': ['scale'],
                     'C': [4, 6,10],
                     'epsilon': [ 4, 6, 10],
                     'shrinking': [True]
                    }
                   ]
print("Generating the models")
clf = GridSearchCV(SVR(), tuned_parameters, cv=5, scoring = "neg_mean_squared_error")

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()
clf.get_params()
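
A follow-up sketch, assuming the split above: score the refit best estimator on the untouched test split (GridSearchCV refits on all of X_train by default).

test_mse = -clf.score(X_test, y_test)  # scoring is neg_mean_squared_error, so negate
print(clf.best_estimator_)
print("test rmse:", math.sqrt(test_mse))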

4. Gradient Boosting

import lightgbm as lgb
def model_good_prediction_ratio_diffs_array(diffs, max_distance):
    # Same hit-rate metric, computed from precomputed target-prediction diffs.
    predictionsDiff = list(map(lambda x: abs(x), list(diffs)))
    return len(list(filter(lambda x: x < max_distance, predictionsDiff)))/len(predictionsDiff)
my_equipment_success = equipment_success.copy()

#my_equipment_success = my_equipment_success.query(' valid == True').copy()
my_equipment_success = my_equipment_success.sample(frac=1)
my_features = [
    'center_id',
    'freq_1000',
    'decades_old',
    # 'years',
    'genderNum',
    # 'freq_250',
    'freq_500',
    # 'freq_750',
    # 'freq_1500',
    'freq_2000',
    # 'freq_3000',
    'freq_4000',
    # 'freq_6000',
    # 'freq_8000'
]

data = my_equipment_success[my_features].values
labels = (my_equipment_success['eq_freq_1000'].values)
losses = (my_equipment_success['freq_1000'].values)
labels_ratio = (my_equipment_success['eq_freq_1000'].values)/losses

[data.shape, labels.shape]
center_ids = my_equipment_success['center_id'].values
tests_per_center = {}
for i in range(len(data)):
    center_id = center_ids[i]
    tests_per_center[center_id] = safe_inc(tests_per_center.get(center_id))
    
labels_per_center = {}
for i in range(len(data)):
    center_id = center_ids[i]
    labels_per_center[center_id] = safe_append(labels_per_center.get(center_id), labels[i])
    
mean_labels_per_center = {k: np.mean(v) for k, v in labels_per_center.items()}

labels_ratio_per_center = {}
for i in range(len(data)):
    center_id = center_ids[i]
    labels_ratio_per_center[center_id] = safe_append(labels_ratio_per_center.get(center_id),
                                                          labels[i]/data[i][1])
    
mean_labels_ratio_per_center = {k: np.mean(v) for k, v in labels_ratio_per_center.items()}
labels_ratio_scaled_per_center = []
for i in range(len(data)):
    center_id = center_ids[i]
    labels_ratio_scaled_per_center.append(labels_ratio[i]/mean_labels_ratio_per_center[center_id])
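
A pandas equivalent sketch of the per-center aggregates, assuming the column names above:

ratio = my_equipment_success['eq_freq_1000'] / my_equipment_success['freq_1000']
mean_labels_ratio = ratio.groupby(my_equipment_success['center_id']).mean()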
min_test_per_center = 200
good_indexes = []
for i in range(len(data)):
    center_id = center_ids[i]
    if tests_per_center[center_id] > min_test_per_center:
        good_indexes.append(i)
    
data = np.asarray([data[index] for index in good_indexes])
labels = np.asarray([labels[index] for index in good_indexes])
losses = np.asarray([losses[index] for index in good_indexes])
labels_ratio = np.asarray([labels_ratio[index] for index in good_indexes])
labels_ratio_scaled_per_center = np.asarray([labels_ratio_scaled_per_center[index] for index in good_indexes])
center_ids = np.asarray([center_ids[index] for index in good_indexes])

n = 4000
[data.shape, labels_ratio.shape]
{k:[v,tests_per_center.get(k)] for k,v in mean_labels_ratio_per_center.items() if tests_per_center.get(k) >= min_test_per_center}

train_data = lgb.Dataset(data[0:n], label=labels[0:n],
                         feature_name=my_features, categorical_feature=['genderNum', 'center_id'])
param = {'num_leaves': 37,
         # 'max_depth': 20,
         # 'min_gain_to_split': 1000,
         'num_trees': 85,  # alias of num_iterations; takes precedence over num_round below
         # 'min_data_in_leaf': 100,
         # 'max_bin': 100,
         'objective': 'regression'}
num_round = 30
bst = lgb.train(param, train_data, num_round)
predictions = bst.predict(data)#*losses

# for i in range(len(data)):
#     center_id = center_ids[i]
#     predictions[i] = predictions[i]*mean_labels_ratio_per_center[center_id]

delta = 5

print("training")
print(np.std(labels[0:n]-np.mean(labels)))
print(np.std(labels[0:n]-predictions[0:n]))
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), delta))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], delta))

print("validation")
print(np.std(labels[n:]-np.mean(labels)))
print(np.std(labels[n:]-predictions[n:]))
print(model_good_prediction_ratio_array(labels[n:], np.mean(labels), delta))
print(model_good_prediction_ratio_array(labels[n:], predictions[n:], delta))
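
A sketch of letting LightGBM choose the iteration count from the held-out slice instead of hand-tuning num_trees (assuming a LightGBM version where early_stopping_rounds is still a train() argument):

valid_data = lgb.Dataset(data[n:], label=labels[n:], reference=train_data)
bst2 = lgb.train({'num_leaves': 37, 'objective': 'regression'}, train_data,
                 num_boost_round=200, valid_sets=[valid_data],
                 early_stopping_rounds=10)
predictions2 = bst2.predict(data, num_iteration=bst2.best_iteration)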
# center_ids was already filtered alongside data above, so rows still align.
diff_per_center = {}
low_baseline_diff_per_center = {}
baseline_diff_per_center = {}
for i in range(0, n):  # training slice; alternatively range(len(data))
    center_id = data[i][0]
    diff = (predictions - labels)[i]
    baseline_diff = labels[i] - mean_labels_per_center[center_id]
    low_baseline_diff = labels[i] - np.mean(labels)
    diff_per_center[center_id] = safe_append(diff_per_center.get(center_id), diff)
    baseline_diff_per_center[center_id] = safe_append(baseline_diff_per_center.get(center_id), baseline_diff)
    low_baseline_diff_per_center[center_id] = safe_append(low_baseline_diff_per_center.get(center_id), low_baseline_diff)
my_dictionary = {k: [np.std(v),
                     model_good_prediction_ratio_diffs_array(v, delta),
                     len(v)] for k, v in diff_per_center.items()}
good_centers = {k: 1 for k, v in my_dictionary.items() if v[1] >= 0}  # placeholder cutoff: keeps every center
good_centers = {k: v for k, v in my_dictionary.items() if good_centers.get(k)}
[sum(list(map(lambda x: x[2], list(my_dictionary.values())))),
 sum(list(map(lambda x: x[2], list(good_centers.values()))))]
{k:v for k,v in mean_labels_ratio_per_center.items() if tests_per_center.get(k) >= 100}
print("validation on good centers")
validation_labels = labels[n:]
validation_predictions = predictions[n:]
validation_data = data[n:]
validation_center_ids = center_ids[n:]



validation_predictions_good = []
validation_labels_good = []
for i in range(len(validation_data)):
    center_id = validation_center_ids[i]
    if good_centers.get(center_id):
        validation_labels_good.append(validation_labels[i])
        validation_predictions_good.append(validation_predictions[i])

validation_labels_good = np.asarray(validation_labels_good)
validation_predictions_good = np.asarray(validation_predictions_good)


print(np.std(validation_labels_good - np.mean(labels)))
print(np.std(validation_labels_good - validation_predictions_good))
print(model_good_prediction_ratio_array(validation_labels_good, np.mean(labels), delta))
print(model_good_prediction_ratio_array(validation_labels_good, validation_predictions_good, delta))
my_dictionary = {k: [np.std(v),
                     model_good_prediction_ratio_diffs_array(v, delta),
                     len(v)] for k, v in baseline_diff_per_center.items()}
my_dictionary
my_dictionary = {k: [np.std(v),
                     model_good_prediction_ratio_diffs_array(v, delta),
                     len(v)] for k, v in low_baseline_diff_per_center.items()}
my_dictionary
predictions = bst.predict(data)*losses  # losses is the filtered freq_1000 column, aligned with data

delta = 5

print("training")
print(np.std(labels[0:n]-np.mean(labels)))
print(np.std(labels[0:n]-predictions[0:n]))
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), delta))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], delta))

print("validation")
print(np.std(labels[n:]-np.mean(labels)))
print(np.std(labels[n:]-predictions[n:]))
print(model_good_prediction_ratio_array(labels[n:], np.mean(labels), delta))
print(model_good_prediction_ratio_array(labels[n:], predictions[n:], delta))

5. Decision Tree

from sklearn import tree
import graphviz 
my_features = [
    'center_id',
    'decades_old',
    'genderNum',
    # 'freq_250',
    'freq_500',
    # 'freq_750',
    'freq_1000',
    # 'freq_1500',
    'freq_2000',
    # 'freq_3000',
    'freq_4000',
    # 'freq_6000',
    'freq_8000',
]
my_equipment_success = equipment_success.copy()


#my_equipment_success = my_equipment_success.query(' valid == True').copy()
my_equipment_success = my_equipment_success.sample(frac=1)
data = my_equipment_success[my_features].values
labels = (my_equipment_success['eq_freq_1000'].values)
[data.shape, labels.shape]

5.1. Regression

n = 10000
clf = tree.DecisionTreeRegressor(max_depth=6)
# other settings tried: min_samples_leaf=2, min_impurity_decrease=0.025
clf.fit(data[0:n], labels[0:n])
[clf.score(data[0:n], labels[0:n]), clf.score(data[n:], labels[n:])]
clf.feature_importances_
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data) 
graph.render("idris")
predictions = clf.predict(data)
print("training")

print(np.std(labels[0:n]-np.mean(labels)))
print(np.std(labels[0:n]-predictions[0:n]))
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], 5))
print("validation")
print(np.std(labels[n:]-np.mean(labels)))
print(np.std(labels[n:]-predictions[n:]))
print(model_good_prediction_ratio_array(labels[n:], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[n:], predictions[n:], 5))
from sklearn.ensemble import RandomForestRegressor
clf2 = RandomForestRegressor(n_estimators=40, max_features=7, max_depth=6) #, min_impurity_decrease=0.025)#,min_samples_leaf=10)
clf2.fit(data[0:n], labels[0:n])
[clf2.score(data[0:n], labels[0:n]), clf2.score(data[n:], labels[n:])]
clf2.get_params()
predictions = clf2.predict(data)
print("training")

print(np.std(labels[0:n]-np.mean(labels)))
print(np.std(labels[0:n]-predictions[0:n]))
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], 5))
print("validation")
print(np.std(labels[n:]-np.mean(labels)))
print(np.std(labels[n:]-predictions[n:]))
print(model_good_prediction_ratio_array(labels[n:], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[n:], predictions[n:], 5))

5.2. Classifier

def rounder(t): 
    return int(round(t / 5))
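
# Sanity check: each class is a 5 dB-wide bucket around a multiple of 5.
assert rounder(12.0) == 2  # ~10 dB
assert rounder(23.0) == 5  # ~25 dB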


classes = np.array([rounder(xi) for xi in labels])
print(classes)
plt.hist(classes)
100*len(list(filter(lambda x : 10 < x < 60, labels)))/len(data)
# Note: this rounds the features of the first sample (row 0), not a column.
data[0] = np.array([rounder(xi) for xi in data[0]])
data[0]
set(classes)
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(n_estimators=10, max_features=3)#,min_samples_leaf=10)
clf2.fit(data[0:4000], classes[0:4000])
[clf2.score(data[0:4000], classes[0:4000]), clf2.score(data[4000:], classes[4000:])]
proba = np.asarray(list(map(lambda x: np.max(x), clf2.predict_proba(data))))
good_indices = np.ndarray.nonzero(proba > 0.8)[0]
good_indices.shape
training_good_indices = list(filter(lambda x : x < 4000, good_indices))
validation_good_indices = list(filter(lambda x : 5000 > x >= 4000, good_indices))

[len(training_good_indices), len(validation_good_indices)]
[clf2.score(data[training_good_indices], classes[training_good_indices]), 
 clf2.score(data[validation_good_indices], classes[validation_good_indices])]
def classifier_good_prediction_ratio_array(targets, predictions):
    predictionsDiff = (targets == predictions)
    return len(list(filter(lambda x: x , predictionsDiff)))/len(predictionsDiff)
clf = tree.DecisionTreeClassifier(#min_samples_leaf=None,
                                  min_impurity_decrease=0.0004)
clf.fit(data[0:4000], classes[0:4000])
[clf.score(data[0:4000], classes[0:4000]), clf.score(data[4000:], classes[4000:])]
proba = np.asarray(list(map(lambda x: np.max(x), clf.predict_proba(data))))
good_indices = np.ndarray.nonzero(proba > 0.8)[0]
good_indices.shape
training_good_indices = list(filter(lambda x : x < 4000, good_indices))
validation_good_indices = list(filter(lambda x : 5000 > x >= 4000, good_indices))

[len(training_good_indices), len(validation_good_indices)]
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data) 
graph.render("idris")
my_features
clf.feature_importances_
[clf.score(data[training_good_indices], classes[training_good_indices]), 
 clf.score(data[validation_good_indices], classes[validation_good_indices])]
relevant_labels = classes[0:4000]
[classifier_good_prediction_ratio_array(relevant_labels, int(np.average(relevant_labels))), 
  clf.score(data[0:4000], classes[0:4000])]
relevant_labels = classes[4000:]
[classifier_good_prediction_ratio_array(relevant_labels, int(np.average(relevant_labels))), 
  clf.score(data[4000:], classes[4000:])]
i = validation_good_indices[1]
print(i)
print(clf.predict_proba(data[[i]]))
classes[i]
[labels[4042],
data[4042]]
list(map(lambda x: [x, rounder(x)], range(1, 100)))

6. Neural Network

def good_prediction_error(y_true, y_pred):
    # Per-sample indicator that the prediction lands within 5 of the target;
    # without abs(), arbitrarily large overshoots would count as good.
    g = tf.abs(tf.subtract(y_true, y_pred))
    g = tf.cast(g < 5.0, tf.float32)
    return g
    
    
#good_prediction_error(np.array([1, 20]), np.array([0, 0]))
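
# To monitor it during training it could be passed to compile() as an extra
# metric (a sketch; the model below tracks mean_squared_error only):
# model.compile(optimizer=optimizers.Adam(lr=0.01), loss='mean_squared_error',
#               metrics=['mean_squared_error', good_prediction_error])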
features
eq = equipment_success.sample(frac=1)
data = (equipment_success[features].values)
labels = (equipment_success['eq_freq_1000'].values)
# Shuffle rows and labels together so the feature/label pairing survives.
perm = np.random.permutation(len(original_data))
data = original_data[perm]
labels = original_labels[perm]
data.shape

model = Sequential([
    Dense(300, input_dim=len(features)),
    advanced_activations.LeakyReLU(alpha=0.3),
    Dense(100),
    advanced_activations.LeakyReLU(alpha=0.3),
    Dense(100),
    advanced_activations.LeakyReLU(alpha=0.3),
    Dense(1),
])


model.compile(optimizer=optimizers.Adam(lr=0.01),
              loss='mean_squared_error',
              metrics=['mean_squared_error'])

validation_split = 0.2
training_samples = int(len(data)*(1-validation_split))
history = model.fit(data, labels, epochs=200, steps_per_epoch=100,
                    validation_split=validation_split,
                    validation_steps=100,
                    verbose=1)
from operator import itemgetter
# Epoch index and RMSE at the lowest training MSE.
a, b = min(enumerate(history.history['mean_squared_error']), key=itemgetter(1))
[a, math.sqrt(b)]

relevant_data = data[0:training_samples]
relevant_labels = labels[0:training_samples]
predictions = model.predict(relevant_data).transpose()[0]
[model_good_prediction_ratio_array(relevant_labels, predictions, 5),
 model_good_prediction_ratio_array(relevant_labels, np.average(relevant_labels), 5),
 np.std(relevant_labels - predictions),
 np.std(relevant_labels - np.average(relevant_labels))
]
relevant_data = data[training_samples:]
relevant_labels = labels[training_samples:]
predictions = model.predict(relevant_data).transpose()[0]
[model_good_prediction_ratio_array(relevant_labels, predictions, 5),
 model_good_prediction_ratio_array(relevant_labels, np.average(relevant_labels), 5),
 np.std(relevant_labels - predictions),
 np.std(relevant_labels - np.average(relevant_labels))
]

# Plot training & validation loss values
plt.plot(history.history['mean_squared_error'])
plt.plot(history.history['val_mean_squared_error'])
plt.title('Model mean_squared_error')
plt.ylabel('mean_squared_error')
plt.ylim(0, 100)
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper right')
plt.show()
test_dataset = equipment_success_all.copy()  # [(equipment_success_valid['isMale'] == 0)
                                             #  & (equipment_success_valid['decades_old'] == 7)
                                             # ].copy()

test_dataset = test_dataset.sample(frac=1)
test_dataset = test_dataset.tail(1000)
test_data = (test_dataset[features].values)
test_labels = (test_dataset['eq_freq_1000'].values)  # the model was trained to predict eq_freq_1000

test_predictions = model.predict(test_data).transpose()[0]
[model_good_prediction_ratio_array(test_labels, test_predictions, 5),
 model_good_prediction_ratio_array(test_labels, np.average(test_labels), 5)]
test_labels - test_predictions