diff --git a/datamodel.py b/datamodel.py
new file mode 100644
index 0000000000000000000000000000000000000000..61b8507a2617f2fc22af62f1ea8b5e07b464eb70
--- /dev/null
+++ b/datamodel.py
@@ -0,0 +1,115 @@
+import pandas as pd
+from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.feature_selection import VarianceThreshold
+
+
+class datamodel:
+    '''
+    Wraps a train/test split together with its sensitive attribute and
+    bundles the pre-processing steps (encoding, scaling, feature selection)
+    applied before training.
+    '''
+
+    def __init__(self, x_train, y_train, x_test, y_test, target_col, pos_class,
+                 neg_class, sensitive_col_name, sensitive_val1, sensitive_val2,
+                 sensitive_train_col, sensitive_test_col):
+        self.x_train = x_train
+        self.x_test = x_test
+        self.y_train = y_train
+        self.y_test = y_test
+        self.target_col = target_col
+        self.pos_class = pos_class
+        self.neg_class = neg_class
+        # The sensitive data is tracked by keeping a separate copy of the
+        # sensitive column, so it remains available after the feature is removed.
+        self.sensitive_col_name = sensitive_col_name
+        self.sensitive_train_col = sensitive_train_col
+        self.sensitive_test_col = sensitive_test_col
+        self.sensitive_val1 = sensitive_val1
+        self.sensitive_val2 = sensitive_val2
+
+    def encode_columns(self, ordinal_cols, nominal_cols):
+        '''
+        Integer-encodes the ordinal columns and one-hot encodes the nominal
+        columns, fitting the encoders on the training data only.
+        '''
+        if ordinal_cols:
+            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
+            self.x_train[ordinal_cols] = encoder.fit_transform(self.x_train[ordinal_cols])
+            self.x_test[ordinal_cols] = encoder.transform(self.x_test[ordinal_cols])
+        if nominal_cols:
+            encoder = OneHotEncoder(handle_unknown='ignore')
+            nominal_x_train = encoder.fit_transform(self.x_train[nominal_cols]).toarray()
+            nominal_x_test = encoder.transform(self.x_test[nominal_cols]).toarray()
+            # get_feature_names_out prefixes each dummy with its source column,
+            # so identical category values in different columns cannot collide.
+            categories = encoder.get_feature_names_out(nominal_cols)
+            self.x_train[categories] = nominal_x_train
+            self.x_train = self.x_train.drop(columns=nominal_cols)
+            self.x_test[categories] = nominal_x_test
+            self.x_test = self.x_test.drop(columns=nominal_cols)
+
+    def remove_sensitive_feature(self):
+        if self.sensitive_col_name in self.x_train.columns:
+            self.x_train = self.x_train.drop(columns=self.sensitive_col_name)
+            self.x_test = self.x_test.drop(columns=self.sensitive_col_name)
+
+    def feature_scaler(self):
+        # Fit the scaler on the training data only to avoid test-set leakage.
+        scaler = MinMaxScaler()
+        self.x_train[self.x_train.columns] = scaler.fit_transform(self.x_train)
+        self.x_test[self.x_test.columns] = scaler.transform(self.x_test)
+
+    def select_features_by_variance(self, binaries):
+        '''
+        Binary attributes in which more than 80% of the values are zeros, or
+        more than 80% are ones, are removed (threshold .8 * (1 - .8)).
+        '''
+        sel = VarianceThreshold(threshold=.8 * (1 - .8))
+        # A list comprehension instead of a set difference preserves column order.
+        non_binaries = [col for col in self.x_train.columns if col not in binaries]
+        sel_x_train = sel.fit_transform(self.x_train[binaries])
+        sel_x_train = pd.DataFrame(sel_x_train, columns=sel.get_feature_names_out(), index=self.x_train.index)
+        sel_x_test = pd.DataFrame(sel.transform(self.x_test[binaries]), columns=sel.get_feature_names_out(), index=self.x_test.index)
+        self.x_train = pd.concat([sel_x_train, self.x_train[non_binaries]], axis=1)
+        self.x_test = pd.concat([sel_x_test, self.x_test[non_binaries]], axis=1)
+
+    def get_sensitive_train_data(self):
+        '''
+        Splits the training data by sensitive value, used when pre-processing
+        the group-specific classifiers.
+        '''
+        sens_1_x_train = self.x_train[self.sensitive_train_col == self.sensitive_val1]
+        sens_1_y_train = self.y_train[self.sensitive_train_col == self.sensitive_val1]
+        sens_2_x_train = self.x_train[self.sensitive_train_col == self.sensitive_val2]
+        sens_2_y_train = self.y_train[self.sensitive_train_col == self.sensitive_val2]
+        return sens_1_x_train, sens_1_y_train, sens_2_x_train, sens_2_y_train
+
+    def get_sensitive_test_data(self):
+        '''
+        Splits the test data by sensitive value, both for the group-specific
+        classifiers and for group-filtered evaluation of the single classifier.
+        '''
+        sens_1_x_test = self.x_test[self.sensitive_test_col == self.sensitive_val1]
+        sens_1_y_test = self.y_test[self.sensitive_test_col == self.sensitive_val1]
+        sens_2_x_test = self.x_test[self.sensitive_test_col == self.sensitive_val2]
+        sens_2_y_test = self.y_test[self.sensitive_test_col == self.sensitive_val2]
+        return sens_1_x_test, sens_1_y_test, sens_2_x_test, sens_2_y_test
+
+    def create_contingency_table(self, attr_label1, attr_label2):
+        '''
+        Visualises the class and group distribution of the training data as
+        relative frequencies.
+        '''
+        train_data = pd.concat([self.x_train, self.y_train], axis=1)
+        # Boolean masks instead of string-built query() calls: they are easier
+        # to read and do not break on column names containing spaces.
+        pos_mask = train_data[self.target_col] == self.pos_class
+        neg_mask = train_data[self.target_col] == self.neg_class
+        val1_mask = train_data[self.sensitive_col_name] == self.sensitive_val1
+        val2_mask = train_data[self.sensitive_col_name] == self.sensitive_val2
+        total = train_data.shape[0]
+        contingency_table = pd.DataFrame(index=[attr_label1, attr_label2, 'total'], data={
+            'positive class': [(pos_mask & val1_mask).sum() / total, (pos_mask & val2_mask).sum() / total, pos_mask.sum() / total],
+            'negative class': [(neg_mask & val1_mask).sum() / total, (neg_mask & val2_mask).sum() / total, neg_mask.sum() / total],
+            'total': [val1_mask.sum() / total, val2_mask.sum() / total, (val1_mask.sum() + val2_mask.sum()) / total]})
+        return contingency_table
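For reviewers, a minimal usage sketch of the class added by this patch. The dataset ('adult.csv'), column names ('income', 'sex', 'education', 'workclass'), category values, and the binary dummy names are illustrative assumptions, not part of the patch; the call order follows the intended pipeline (inspect the distribution while the sensitive column is still present, encode, drop the sensitive feature, scale, then variance-filter the binaries).

import pandas as pd
from sklearn.model_selection import train_test_split
from datamodel import datamodel

# Hypothetical dataset and column names, purely for illustration.
data = pd.read_csv('adult.csv')
x = data.drop(columns='income')
y = data['income']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

dm = datamodel(x_train, y_train, x_test, y_test,
               target_col='income', pos_class='>50K', neg_class='<=50K',
               sensitive_col_name='sex', sensitive_val1='Female', sensitive_val2='Male',
               sensitive_train_col=x_train['sex'], sensitive_test_col=x_test['sex'])

# Inspect class/group balance before the sensitive column is dropped.
print(dm.create_contingency_table('female', 'male'))

dm.encode_columns(ordinal_cols=['education'], nominal_cols=['workclass'])
dm.remove_sensitive_feature()
dm.feature_scaler()  # assumes all remaining columns are numeric at this point
# Dummy column names produced by the one-hot step; hypothetical here.
dm.select_features_by_variance(binaries=['workclass_Private', 'workclass_Self-emp'])

# Per-group splits for the group-specific classifiers.
sens_1_x, sens_1_y, sens_2_x, sens_2_y = dm.get_sensitive_train_data()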