diff --git a/code/analisis_resultados.py b/code/analisis_resultados.py
new file mode 100644
index 0000000000000000000000000000000000000000..d73e1833375dbc5b6b3403307eac23466fad8415
--- /dev/null
+++ b/code/analisis_resultados.py
@@ -0,0 +1,100 @@
+import os
+import pickle
+import time
+
+import numpy as np
+import pandas as pd
+import tqdm
+
+import load_saved_models as lsm
+
+
+DATA_PATH = os.path.abspath(os.path.join(__file__, '../../results/regression'))
+
+
+def get_bests(var_names=('Mn(kg·mol)-1', 'MwMn', 'conversion')):
+    bests = {}
+    for var_name in var_names:
+        bests[var_name] = {}
+        means = pd.read_excel(os.path.join(DATA_PATH, f'{var_name}_mean.xlsx'), index_col=0)
+        # Columns come in pairs; every other one holds a model's mean score
+        for column in means.columns[0::2]:
+            results = means[column]
+            indexes = np.array(list(means.index))
+            # Descending sort: the three highest mean scores are kept as the best
+            order = np.argsort(results.values)[::-1]
+            bests[var_name][column.replace('(mean)', '')] = list(indexes[order[:3]])
+
+    return bests
+
+
+def save_bests_excel(var_name, data):
+    # data maps each model name to {f'{preprocessing}_{var_name}': [yhat, mae, mape]},
+    # e.g. {'LR': {...}, 'SVR': {...}, 'RF': {...}}
+    print(data)
+    excel_data = {'Preprocessing': [f"{l}{k}".replace(var_name, '') for k in data for l in data[k]],
+                  'MAE': [data[k][l][1] for k in data for l in data[k]],
+                  'Model': [k for k in data for _ in data[k]]}
+
+    df = pd.DataFrame(excel_data)
+    df.set_index('Preprocessing', inplace=True)
+    df.to_excel(f'{var_name}_bests.xlsx')
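+
+
+# Illustrative call for save_bests_excel (the values and preprocessing names here are
+# hypothetical); the nested dict mirrors bests_dict as built in the __main__ block below:
+#
+#   save_bests_excel('MwMn', {'LR': {'RAW_PCA_MwMn': [1.82, 0.05, 2.3]},
+#                             'SVR': {...},
+#                             'RF': {...}})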
+
+
+if __name__ == "__main__":
+
+    var_names = ['Mn (kg·mol)-1', 'MwMn', 'conversion']
+    dict_model = {}
+    bests_dict = {}
+    seeds = [645, 4578, 72, 2365, 90345, 24, 1859, 1334, 2078, 2446, 7409, 6995, 2041, 449, 9475]
+
+    all_bests = get_bests()
+
+    # For every dependent variable, evaluate the best models of each kind,
+    # averaging predictions and errors over all seeds
+    for var in var_names:
+        bests = all_bests[var.replace(' ', '')]
+        bests_dict[var] = {}
+        for best_key in bests:
+            bests_dict[var][best_key] = {}
+            for name in bests[best_key]:
+                print(var, best_key, name)
+                yhat_val = []
+                maes_val = []
+                mapes_val = []
+                model_varname = f'{name}_{var}'
+                model_type = 'linear' if best_key == 'LR' else ('forest' if best_key == 'RF' else 'svr')
+
+                for seed in tqdm.tqdm(seeds):
+                    data, ensemble = lsm.load_ensemble(f'{name}_{var}', f'./results/models/{seed}/{name}_{var}_{model_type}.pkl', seed)
+                    y_hat = ensemble.predict_ensemble(data[0])
+                    mae, mape = ensemble.compute_mae_mape_ensemble(data[0], data[1])
+                    yhat_val.append(y_hat)
+                    maes_val.append(mae)
+                    mapes_val.append(mape)
+
+                bests_dict[var][best_key][model_varname] = [np.mean(yhat_val), np.mean(maes_val), np.mean(mapes_val)]
+
+        save_bests_excel(var, bests_dict[var])
+
+    pickle.dump(bests_dict, open('all_bests.pkl', 'wb'))
+
+    # Evaluate the best RF models again, this time also timing ensemble prediction
+    best_rf = [all_bests[v.replace(' ', '')]['RF'] for v in var_names]
+
+    for var, best_model in zip(var_names, best_rf):
+        for name in best_model:
+            yhat_val = []
+            maes_val = []
+            mapes_val = []
+            model_varname = f'{name}_{var}'
+            time_t = []
+
+            for seed in tqdm.tqdm(seeds):
+                data, ensemble = lsm.load_ensemble(f'{name}_{var}', f'./results/models/{seed}/{name}_{var}_forest.pkl', seed)
+                start_time = time.time()
+                y_hat = ensemble.predict_ensemble(data[0])
+                end_time = time.time()
+                time_t.append(end_time - start_time)
+                mae, mape = ensemble.compute_mae_mape(data[0], data[1])
+                yhat_val.append(y_hat)
+                maes_val.append(mae)
+                mapes_val.append(mape)
+
+            dict_model[model_varname] = [np.mean(yhat_val, axis=0), np.mean(maes_val, axis=0), np.mean(mapes_val, axis=0), np.mean(time_t)]
+
+        pickle.dump([dict_model], open(f'resolution_{var}.pkl', 'wb'))
\ No newline at end of file
diff --git a/code/feature_selection.py b/code/feature_selection.py
index 376b0ceef55eba80a82f38aefcfb1d73e9e055b8..6041277dbb100432dc6dde8ed10da4dc8c498746 100644
--- a/code/feature_selection.py
+++ b/code/feature_selection.py
@@ -1,13 +1,18 @@
 import numpy as np
 import pandas as pd
+import pickle as pkl
 import seaborn as sns
 import matplotlib.pyplot as plt
 
 # Machine learning
 import xgboost as xgb
 from sklearn.tree import DecisionTreeRegressor
+from sklearn.preprocessing import StandardScaler
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.feature_selection import SelectKBest, f_regression, RFE
+from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
+
+# User imports
+from prebio_df import read_imputed
 
 
 DELETE_PERC = 90
@@ -15,14 +20,14 @@ DELETE_PERC = 90
 
 # ELBOW METHOD
 def find_stable_index(metric_values: np.array, tolerance: float)-> float:
     """
-    Finds the index where a metric stabilizes in an array of values.
+    Find the index at which a metric stabilizes in an array of values.
 
-    Parameters:
-    - metric_values: A list or array of metric values.
-    - tolerance: The maximum allowed difference between successive values.
+    Parameters:
+    - metric_values: An ordered list or array of metric values.
+    - tolerance: The maximum allowed absolute difference between successive values.
 
-    Returns:
-    - The index where the metric stabilizes or None if not found.
+    Returns:
+    - The index at which the metric stabilizes, or None if it never does.
     """
     for i in range(1, len(metric_values)):
         if abs(metric_values[i] - metric_values[i - 1]) <= tolerance:
@@ -145,4 +150,43 @@ def random_forest_selector(x: pd.DataFrame, y: pd.DataFrame, tolerance=0.01, **k
         plt.show()
 
     return results
-    
\ No newline at end of file
+
+
+def print_distribution(dataframe, name, delete_indexes):
+    outlier_df = dataframe.copy(deep=True)
+    outlier_df['outlier'] = [i in delete_indexes for i in range(outlier_df.shape[0])]
+    sns.displot(data=outlier_df, x=name, hue='outlier', palette="blend:#7AB,#E06B53", edgecolor="white", linewidth=0)
+    sns.despine()
+    plt.tight_layout()
+    plt.savefig(f"./results/feature_extraction/{name.replace('/', '_')}_outliers.png", transparent=False)
+
+
+def main():
+    dependent_vars = ['Mn (kg·mol)-1', 'Mw/Mn', 'conversion']
+    reduced_path = './data/reduced_df.pickle'
+
+    df = read_imputed().select_dtypes('number')
+    reduced_dfs = {}
+
+    for name in dependent_vars:
+        print(f"--- Processing {name.capitalize()} ---")
+        # Features are the numeric columns minus all dependent variables;
+        # the target is the current dependent variable
+        x = df.drop(columns=dependent_vars)
+        y = df[name]
+
+        results = q1_feature_selection(x, y, DELETE_PERC, stats=name)
+        # Uncomment to use RFE or XGBoost instead
+        #results = rfe_selection(x, y, DELETE_PERC, stats=name)
+        #results = xgboost_selection(x, y, DELETE_PERC)
+        result = random_forest_selector(x, y)
+
+        #print(list(results.index))
+        #
+        #x = x[list(results.index)]
+        #
+        ## Save
+        #x.reset_index(inplace=True, drop=True)
+        #y.reset_index(inplace=True, drop=True)
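+
+        # A hedged sketch of the save step the commented lines above build toward:
+        # collect the reduced split per variable (assuming results.index holds the
+        # selected feature names), matching the Dict[str, Tuple] shape that
+        # load_reduced_df() in prebio_df.py returns. Left disabled like the block above:
+        #reduced_dfs[name] = (x[list(results.index)], y)
+
+    # After the loop, the reduced splits could be pickled to reduced_path:
+    #with open(reduced_path, 'wb') as f:
+    #    pkl.dump(reduced_dfs, f)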
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/code/load_saved_models.py b/code/load_saved_models.py
index 77dac6aaf4fa5658acd516c8d34b6cde801fbeda..f80c6e02c700ffc7f20d1a4a77d2999858073056 100644
--- a/code/load_saved_models.py
+++ b/code/load_saved_models.py
@@ -81,6 +81,5 @@ def load_ensemble(data_name: str, model_data_path: str, seed: int) -> Tuple[pd.D
     """
    data_all = pickle.load(open(f'data/all_processed_{seed}.pkl','rb'))
    ensemble = Ensemble(pickle.load(open(model_data_path, 'rb')))
-    #print(data_name)
-    #print(model_data_path)
-    return data_all[data_name], ensemble
+
+    return data_all[data_name], ensemble
\ No newline at end of file
diff --git a/code/missing_imputation.py b/code/missing_imputation.py
new file mode 100644
index 0000000000000000000000000000000000000000..7429ec685ffc17fa47ad7be1c8bf2c06a683c2f9
--- /dev/null
+++ b/code/missing_imputation.py
@@ -0,0 +1,88 @@
+import click
+import numpy as np
+import pandas as pd
+from sklearn.impute import KNNImputer
+
+from prebio_df import read_prebio
+from nan_analysis import missing_values_table
+
+
+def impute_missing(target: pd.Series, dataset: pd.DataFrame) -> pd.DataFrame:
+    """Impute missing values in the dataset using KNN imputation.
+
+    Args:
+        target (pd.Series): Target row with missing values to be imputed.
+        dataset (pd.DataFrame): Dataset providing the reference rows for imputation.
+
+    Returns:
+        pd.DataFrame: Dataset, including the target row, with missing values imputed.
+    """
+    # Keep the row's original label so the caller can align the result via df70.update()
+    dataset.loc[target.name] = target
+    imputer = KNNImputer()
+    res = imputer.fit_transform(dataset)
+
+    return pd.DataFrame(res, columns=dataset.columns, index=dataset.index)
+
+
+def check_complete_samples(df: pd.DataFrame) -> tuple:
+    """Find all samples with 0% missing values.
+
+    Args:
+        df (pd.DataFrame): Input dataframe
+
+    Returns:
+        tuple: A tuple containing the indexes of samples with 0% missing values (empty if none),
+        an array of NaN counts for each row, and the unique NaN counts.
+    """
+    nans = np.array([r.isnull().sum() for _, r in df.iterrows()])
+    indexes = np.where(nans == 0)[0]
+    print(f'> The number of instances with 0% missing values is {len(indexes)}!')
+    unique, counts = np.unique(nans, return_counts=True)
+    print(dict(zip(unique, counts)))
+
+    return indexes, nans, unique
+
+
+@click.command()
+@click.option('--ds', default='../data/polycarbonates_entregable.xlsx', help='Path to prebio2 dataset (.xlsx)')
+def main(ds):
+    """Preprocess the dataset by imputing missing values and storing the result.
+
+    Args:
+        ds (str): Path to the prebio2 dataset (.xlsx).
+    """
+    df = read_prebio(ds)
+    missing_table = missing_values_table(df)
+
+    # Drop columns with more than 70% of their data missing
+    more_than_70_labels = missing_table[missing_table['% of Total Values'] > 70].index
+    df70 = df.drop(columns=more_than_70_labels)
+    missing_values_table(df70, title="------ Columns dropped ------")
+
+    # Get all the samples with 0% missing values
+    indexes, nans, unique = check_complete_samples(df70)
+    full_df = df70.iloc[indexes]
+
+    # To simplify imputing new data we drop the categorical columns (none of them contain NaNs)
+    full_df = full_df.drop(columns=['Epoxide', 'SMILES', 'Catalyst'])
+
+    # Assert that the reference set really has no missing values
+    assert missing_values_table(full_df).empty, "This shouldn't be happening. Your full_df has empty values!"
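+
+    # Each newly imputed row is appended to full_df, so rows with more gaps are
+    # imputed against an ever larger reference set (the KNNImputer is refit on
+    # that grown set at every call).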
+    # Impute missing data sequentially, in order of increasing NaN count
+    for min_nan in np.sort(unique)[1:]:  # Skip the 0-NaN bucket: those samples are already in full_df
+        for index in np.where(nans == min_nan)[0]:
+            full_df = impute_missing(df70.iloc[index], full_df)
+
+    # update() aligns on the row labels preserved by impute_missing
+    df70.update(full_df)
+    missing_values_table(df70, title='------ Imputed DataFrame ------')
+
+    path = '../data/imputed_polycarbonates.xlsx'
+    print(f'The new dataset will be stored in {path}')
+    df70.to_excel(path)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/code/nan_analysis.py b/code/nan_analysis.py
index bc4d58020d25f5780b9b19374c506efc213b090c..05ba89d7d971fd51918d73ff24130e3669d826ed 100644
--- a/code/nan_analysis.py
+++ b/code/nan_analysis.py
@@ -1,6 +1,8 @@
+import click
 import numpy as np
 import pandas as pd
 
+from prebio_df import read_prebio
 
 def missing_values_table(df: pd.DataFrame, title:str=None) -> pd.DataFrame:
     """Generate a table of missing values in the dataframe.
@@ -61,3 +63,36 @@ def get_nans_per_row(df) -> pd.DataFrame:
         new_data.loc[len(new_data)] = new_row
 
     return new_data
+
+
+def generate_images(df):
+    import seaborn as sns
+    import seaborn_image as snsi
+    import matplotlib.pyplot as plt
+
+    # Render the boolean NaN mask of the whole dataframe as an image
+    res = df.isnull()
+    snsi.imgplot(res, gray=True)
+    plt.tight_layout()
+    plt.savefig('NaN.png')
+    plt.close()
+
+    # Bar plot of the number of missing values per column
+    res = missing_values_table(df)
+    res_len = res.shape[0]
+    res.reset_index(inplace=True)
+    res = res.rename(columns={'index': f'Feature ID ({res_len} columns)'})
+    res = res.drop(columns=['% of Total Values'])
+
+    sns.barplot(res, x=f'Feature ID ({res_len} columns)', y='Missing Values')
+    sns.despine()
+    plt.tick_params(
+        axis='x',          # changes apply to the x-axis
+        which='both',      # both major and minor ticks are affected
+        bottom=False,      # ticks along the bottom edge are off
+        top=False,         # ticks along the top edge are off
+        labelbottom=False  # tick labels along the bottom edge are off
+    )
+
+    plt.tight_layout()
+    plt.savefig('NaNPerColumn.png', transparent=True)
+    plt.close()
\ No newline at end of file
diff --git a/code/prebio_df.py b/code/prebio_df.py
index 3035477424d39e310e5ce90326f50da344f736a1..80ddbe0e4b89ff4e1d5731b9b73a48a72cdb3c6d 100644
--- a/code/prebio_df.py
+++ b/code/prebio_df.py
@@ -65,4 +65,4 @@ def load_split_df(path: str = './data/imputed_polycarbonates.xlsx') -> tuple:
 
 def load_reduced_df() -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]:
     with open('./data/reduced_df.pickle', 'rb') as f:
-        return pkl.load(f)
\ No newline at end of file
+        return pkl.load(f)
diff --git a/code/preprocessing.py b/code/preprocessing.py
index e5d0020dd3fcfec994f59efb0bdbbdd57bf2fc9a..c04788d77bab9a3003de17c44d831f4c143f3177 100644
--- a/code/preprocessing.py
+++ b/code/preprocessing.py
@@ -1,6 +1,7 @@
 import click
 import numpy as np
 import pandas as pd
+from scipy import stats
 from sklearn.impute import KNNImputer
 
 from prebio_df import read_prebio
diff --git a/code/results_manager.py b/code/results_manager.py
index 23873a5335060117faaff91c4eb2b46b4f439a4b..6582703eb99998578756b0bfe8feee95c366caf7 100644
--- a/code/results_manager.py
+++ b/code/results_manager.py
@@ -7,7 +7,7 @@
 class DataProvider:
 
     def __init__(self) -> None:
-        self.data = {} # Format: Name: Dict({var_name: [x, y]})
+        self.data = {}  # Format: {name: {var_name: [x, y]}}
 
     @property
     def ids(self):