Commit 70138bf6 authored by Aritz

Code Update

parent f94bf0e5
import pickle
import numpy as np
import load_saved_models as lsm
import time
import tqdm
import pandas as pd
import os
DATA_PATH = os.path.abspath(os.path.join(__file__, '../../results/regression'))
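# For every target variable, keep the three preprocessing pipelines with the
# highest mean score per model family, read from the <var>_mean.xlsx summaries.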
def get_bests(var_names=('Mn(kg·mol)-1', 'MwMn', 'conversion')):
    bests = {}
    for var_name in var_names:
        bests[var_name] = {}
        means = pd.read_excel(os.path.join(DATA_PATH, f'{var_name}_mean.xlsx'), index_col=0)
        for column in means.columns[0::2]:  # every other column holds a '(mean)' score
            results = means[column]
            indexes = np.array(list(means.index))
            order = np.argsort(results.values)[::-1]  # descending: highest mean first
            bests[var_name][column.replace('(mean)', '')] = list(indexes[order[:3]])
    return bests
def save_bests_excel(var_name, data):
    # data format: {model: {preprocessing_name: [mean_yhat, mean_mae, mean_mape]}}
    print(data)
    excel_data = {'Preprocessing': [f"{l}{k}".replace(var_name, '') for k in data for l in data[k]],
                  'MAE': [data[k][l][1] for k in data for l in data[k]],
                  # Assumes the fixed layout of three best pipelines per model family
                  'Model': ['LR', 'LR', 'LR', 'SVR', 'SVR', 'SVR', 'RF', 'RF', 'RF']}
    df = pd.DataFrame(excel_data)
    df.set_index('Preprocessing', inplace=True)
    df.to_excel(f'{var_name}_bests.xlsx')
if __name__ == "__main__":
    var_names = ['Mn (kg·mol)-1', 'MwMn', 'conversion']
    dict_model = {}
    bests_dict = {}
    seeds = [645, 4578, 72, 2365, 90345, 24, 1859, 1334, 2078, 2446, 7409, 6995, 2041, 449, 9475]
    all_bests = get_bests()
    # Iterate over all the best models and evaluate each one over every seed
    for var in var_names:
        bests = all_bests[var.replace(' ', '')]
        bests_dict[var] = {}
        for best_key in bests:
            bests_dict[var][best_key] = {}
            for name in bests[best_key]:
                print(var, best_key, name)
                yhat_val = []
                maes_val = []
                mapes_val = []
                model_varname = f'{name}_{var}'
                # Map the spreadsheet's model label to the pickled model-type suffix
                model_type = 'linear' if best_key == 'LR' else ('forest' if best_key == 'RF' else 'svr')
                for seed in tqdm.tqdm(seeds):
                    data, ensemble = lsm.load_ensemble(f'{name}_{var}', f'./results/models/{seed}/{name}_{var}_{model_type}.pkl', seed)
                    y_hat = ensemble.predict_ensemble(data[0])
                    mae, mape = ensemble.compute_mae_mape_ensemble(data[0], data[1])
                    yhat_val.append(y_hat)
                    maes_val.append(mae)
                    mapes_val.append(mape)
                # Average predictions and errors over all seeds
                bests_dict[var][best_key][model_varname] = [np.mean(yhat_val), np.mean(maes_val), np.mean(mapes_val)]
        save_bests_excel(var, bests_dict[var])
    pickle.dump(bests_dict, open('all_bests.pkl', 'wb'))
    # Iterate over all the best RF models and save predictions, errors and timings
    best_rf = [all_bests[v.replace(' ', '')]['RF'] for v in var_names]
    for var, best_model in zip(var_names, best_rf):
        for name in best_model:
            yhat_val = []
            maes_val = []
            mapes_val = []
            model_varname = f'{name}_{var}'
            time_t = []
            for seed in tqdm.tqdm(seeds):
                data, ensemble = lsm.load_ensemble(f'{name}_{var}', f'./results/models/{seed}/{name}_{var}_forest.pkl', seed)
                start_time = time.time()
                y_hat = ensemble.predict_ensemble(data[0])
                end_time = time.time()
                execution_time = end_time - start_time
                time_t.append(execution_time)
                mae, mape = ensemble.compute_mae_mape(data[0], data[1])
                yhat_val.append(y_hat)
                maes_val.append(mae)
                mapes_val.append(mape)
            dict_model[model_varname] = [np.mean(yhat_val, axis=0), np.mean(maes_val, axis=0),
                                         np.mean(mapes_val, axis=0), np.mean(time_t)]
        pickle.dump([dict_model], open(f'resolution_{var}.pkl', 'wb'))
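# For reference, get_bests() reads results/regression/<var>_mean.xlsx with one row per
# preprocessing pipeline and a '(mean)' score column at every even position (the odd
# positions are skipped; plausibly the matching '(std)' columns). A hypothetical
# compatible file, all names and numbers made up:
#
#   sketch = pd.DataFrame(
#       {'LR(mean)': [0.91, 0.87], 'LR(std)': [0.02, 0.03],
#        'SVR(mean)': [0.93, 0.90], 'SVR(std)': [0.01, 0.02],
#        'RF(mean)': [0.95, 0.94], 'RF(std)': [0.02, 0.01]},
#       index=['RAW_PCA', 'RAW_KBEST'])
#   sketch.to_excel('conversion_mean.xlsx')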
import numpy as np
import pandas as pd
import pickle as pkl
import seaborn as sns
import matplotlib.pyplot as plt
# Machine learning
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
# User imports
from prebio_df import read_imputed

DELETE_PERC = 90
@@ -15,14 +20,14 @@ DELETE_PERC = 90
# ELBOW METHOD
def find_stable_index(metric_values: np.array, tolerance: float) -> int:
    """
    Finds the index where a metric stabilizes in an array of values.

    Parameters:
    - metric_values: A list or array of metric values.
    - tolerance: The maximum allowed difference between successive values.

    Returns:
    - The index where the metric stabilizes, or None if not found.
    """
    for i in range(1, len(metric_values)):
        if abs(metric_values[i] - metric_values[i - 1]) <= tolerance:
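# Quick usage sketch (hypothetical MAE values), assuming the elided body returns
# the first qualifying index:
#   maes = [0.90, 0.52, 0.41, 0.40]
#   find_stable_index(maes, tolerance=0.01)  # -> 3: first step with change <= 0.01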
@@ -145,4 +150,43 @@ def random_forest_selector(x: pd.DataFrame, y: pd.DataFrame, tolerance=0.01, **k
    plt.show()
    return results
def print_distribution(dataframe, name, delete_indexes):
    outlier_df = dataframe.copy(deep=True)
    outlier_df['outlier'] = [i in delete_indexes for i in range(outlier_df.shape[0])]
    sns.displot(data=outlier_df, x=name, hue='outlier', palette="blend:#7AB,#E06B53",
                **dict(edgecolor="white", linewidth=0))
    sns.despine()
    plt.tight_layout()
    plt.savefig(f"./results/feature_extraction/{name.replace('/', '_')}_outliers.png", transparent=False)
def main():
    dependent_vars = ['Mn (kg·mol)-1', 'Mw/Mn', 'conversion']
    reduced_path = './data/reduced_df.pickle'
    df = read_imputed().select_dtypes('number')
    reduced_dfs = {}
    for name in dependent_vars:
        print(f"--- Processing {name.capitalize()} ---")
        df_name = df.copy(deep=True)
        # Split into features and target; the original `x, y = df[name]` unpacking would
        # fail on a single Series, so assume features = everything except the targets.
        x = df_name.drop(columns=dependent_vars)
        y = df_name[name]
        results = q1_feature_selection(x, y, DELETE_PERC, stats=name)
        # Uncomment to use RFE
        #results = rfe_selection(x, y, DELETE_PERC, stats=name)
        #results = xgboost_selection(x, y, DELETE_PERC)
        result = random_forest_selector(x, y)
        #print(list(results.index))
        #
        #x = x[list(results.index)]
        #
        ## Save
        #x.reset_index(inplace=True, drop=True)
        #y.reset_index(inplace=True, drop=True)

if __name__ == "__main__":
    main()
@@ -81,6 +81,5 @@ def load_ensemble(data_name: str, model_data_path: str, seed: int) -> Tuple[pd.D
    """
    data_all = pickle.load(open(f'data/all_processed_{seed}.pkl', 'rb'))
    ensemble = Ensemble(pickle.load(open(model_data_path, 'rb')))
    return data_all[data_name], ensemble
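# Hypothetical call, mirroring how the evaluation script above uses this function:
#   data, ensemble = load_ensemble('RAW_PCA_conversion',
#                                  './results/models/645/RAW_PCA_conversion_forest.pkl', 645)
#   y_hat = ensemble.predict_ensemble(data[0])  # data = [x, y]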
import click
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from prebio_df import read_prebio
from nan_analysis import missing_values_table
def impute_missing(target: pd.Series, frame: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values in the dataset using KNN imputation.

    Args:
        target (pd.Series): Target row with missing values to be imputed.
        frame (pd.DataFrame): Dataset containing the rows used for imputation.

    Returns:
        pd.DataFrame: Dataset with missing values imputed using KNN imputation.
    """
    # Append the target row under its original label (target.name) so that the
    # later df70.update() can align the imputed values by row label.
    frame.loc[target.name] = target
    imputer = KNNImputer()
    res = imputer.fit_transform(frame)
    return pd.DataFrame(res, columns=frame.columns, index=frame.index)
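# Hypothetical call: append one partially observed row and let KNNImputer fill its NaNs
# from the nearest rows (sklearn's default of 5 neighbours):
#   completed = impute_missing(df70.iloc[idx], full_df)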
def check_complete_samples(df: pd.DataFrame) -> tuple:
    """Find all samples with 0% missing values.

    Args:
        df (pd.DataFrame): Input dataframe

    Returns:
        tuple: A tuple containing the indexes of samples with 0% missing values (empty if none),
            an array of NaN counts for each row, and the unique NaN counts.
    """
    nans = np.array([r.isnull().sum() for _, r in df.iterrows()])
    indexes = np.where(nans == 0)[0]
    print(f'> The number of instances with 0% missing values is {len(indexes)}!')
    unique, counts = np.unique(nans, return_counts=True)
    print(dict(zip(unique, counts)))
    return indexes, nans, unique
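# Example of the printed summary (hypothetical counts): {0: 12, 1: 7, 3: 2} would mean
# 12 complete rows, 7 rows with one NaN and 2 rows with three NaNs each.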
@click.command()
@click.option('--ds', default='../data/polycarbonates_entregable.xlsx', help='Path to prebio2 dataset (.xlsx)')
def main(ds):
    """Preprocess the dataset by imputing missing values and storing the result.

    Args:
        ds (str): Path to the prebio2 dataset (.xlsx).
    """
    df = read_prebio(ds)
    missing_table = missing_values_table(df)
    # print(missing_table)
    # Drop columns with more than 70% of their data missing.
    more_than_70_labels = missing_table[missing_table['% of Total Values'] > 70].index
    df70 = df.drop(columns=more_than_70_labels)
    missing_values_table(df70, title="------ Columns dropped ------")
    # Get all the samples with 0% missing values
    indexes, nans, unique = check_complete_samples(df70)
    full_df = df70.iloc[indexes]
    # To simplify imputing new data we drop the categorical columns (none of them contain NaNs)
    full_df = full_df.drop(columns=['Epoxide', 'SMILES', 'Catalyst'])
    # Assert the complete subset really has no missing values
    assert missing_values_table(full_df).empty, "This shouldn't be happening. Your full_df has empty values!"
    # Impute missing data sequentially, from the rows with the fewest NaNs to the most,
    # so each newly completed row can support the imputation of the following ones
    for min_nan in np.sort(unique)[1:]:  # skip 0: the complete rows are already in full_df
        for index in np.where(nans == min_nan)[0]:
            full_df = impute_missing(df70.iloc[index], full_df)
    df70.update(full_df)
    missing_values_table(df70, title='------ Imputed DataFrame ------')
    path = '../data/imputed_polycarbonates.xlsx'
    print(f'The new dataset will be stored in {path}')
    df70.to_excel(path)

if __name__ == "__main__":
    main()
import click
import numpy as np
import pandas as pd
from prebio_df import read_prebio

def missing_values_table(df: pd.DataFrame, title: str = None) -> pd.DataFrame:
    """Generate a table of missing values in the dataframe.
@@ -61,3 +63,36 @@ def get_nans_per_row(df) -> pd.DataFrame:
        new_data.loc[len(new_data)] = new_row
    return new_data
def generate_images(df):
    import seaborn as sns
    import seaborn_image as snsi
    import matplotlib.pyplot as plt

    # Binary NaN mask rendered as an image: missing cells stand out as pixels
    res = df.isnull()
    snsi.imgplot(res, gray=True)
    plt.tight_layout()
    plt.savefig('NaN.png')
    plt.close()

    # Bar plot of missing values per column
    res = missing_values_table(df)
    res_len = res.shape[0]
    res.reset_index(inplace=True)
    res = res.rename(columns={'index': f'Feature ID ({res_len} columns)'})
    res = res.drop(columns=['% of Total Values'])
    sns.barplot(res, x=f'Feature ID ({res_len} columns)', y='Missing Values')
    sns.despine()
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False
    )
    plt.tight_layout()
    plt.savefig('NaNPerColumn.png', transparent=True)
    plt.close()
@@ -65,4 +65,4 @@ def load_split_df(path: str = './data/imputed_polycarbonates.xlsx') -> tuple:
def load_reduced_df() -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]:
    with open('./data/reduced_df.pickle', 'rb') as f:
        return pkl.load(f)
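# Hypothetical usage, one (features, target) pair per dependent variable:
#   x, y = load_reduced_df()['conversion']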
import click
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.impute import KNNImputer
from prebio_df import read_prebio
@@ -7,7 +7,7 @@ from typing import Any
class DataProvider:
    def __init__(self) -> None:
        self.data = {}  # Format: {name: {var_name: [x, y]}}

    @property
    def ids(self):
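# Hypothetical usage, following the declared format {name: {var_name: [x, y]}}:
#   provider = DataProvider()
#   provider.data['RAW_PCA'] = {'conversion': [x_df, y_df]}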