diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..bf68fea41cb5b23f95c8c8887893482adc38cbb3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,169 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: + .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+.idea/ + +# VS Code +.vscode/ + +# Prebio2 +results/models/* +!results/models/holder.txt +results/regression/*.xlsx +results-* diff --git a/README.md b/README.md index e8ee45298f6fc45eab2d7d16b84a59cfbf6e42b1..618ca3e62df1db30b48fbed683ac80f80e040e4a 100644 --- a/README.md +++ b/README.md @@ -1,93 +1,6 @@ # CO2-polycarbonates - - -## Getting started - -To make it easy for you to get started with GitLab, here's a list of recommended next steps. - -Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)! - -## Add your files - -- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files -- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command: - -``` -cd existing_repo -git remote add origin https://git.code.tecnalia.com/aritz.martinez/co2-polycarbonates.git -git branch -M main -git push -uf origin main -``` - -## Integrate with your tools - -- [ ] [Set up project integrations](https://git.code.tecnalia.com/aritz.martinez/co2-polycarbonates/-/settings/integrations) - -## Collaborate with your team - -- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/) -- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) -- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically) -- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/) -- [ ] [Set auto-merge](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html) - -## Test and Deploy - -Use the built-in continuous integration in GitLab. - -- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html) -- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing (SAST)](https://docs.gitlab.com/ee/user/application_security/sast/) -- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html) -- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/) -- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html) - -*** - -# Editing this README - -When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thanks to [makeareadme.com](https://www.makeareadme.com/) for this template. - -## Suggestions for a good README - -Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information. - -## Name -Choose a self-explaining name for your project. - -## Description -Let people know what your project can do specifically. 
Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors. - -## Badges -On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge. - -## Visuals -Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method. - -## Installation -Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection. - -## Usage -Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README. - -## Support -Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc. - -## Roadmap -If you have ideas for releases in the future, it is a good idea to list them in the README. - -## Contributing -State if you are open to contributions and what your requirements are for accepting them. - -For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self. - -You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser. - -## Authors and acknowledgment -Show your appreciation to those who have contributed to the project. - -## License -For open source projects, say how it is licensed. - -## Project status -If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. +Steps to run the code: +1. Create python environment (v3.10) +2. `pip install -r requirements.txt` +3. 
run with: `python code/main.py` \ No newline at end of file diff --git a/code/dimensionality_reduction.py b/code/dimensionality_reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..5e5ef8a0f87a240cddaacffafe712d28d06205dd --- /dev/null +++ b/code/dimensionality_reduction.py @@ -0,0 +1,99 @@ +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +from sklearn.decomposition import PCA, KernelPCA +from sklearn.preprocessing import StandardScaler + + +class PCA_variance: + def __init__(self, target_variance, max_components=None, **kwargs) -> None: + self.target_variance = target_variance + self.max_components = max_components + self.best = None + self.pc_decomposition = None + self.num_components = None + self.explained_variance = None + self.kwargs = kwargs + + def run(self, data: pd.DataFrame) -> pd.DataFrame: + scaler = StandardScaler(with_mean=True, with_std=True) + x_standarized = pd.DataFrame(scaler.fit_transform(data), columns = data.columns) + + # If explained variance is 0 < expl_var < 1 it computes automatically in function of the explained variance + self.best = PCA(n_components=self.target_variance if self.max_components is None else self.max_components, + random_state=self.kwargs['random_state']) + + values = self.best.fit_transform(x_standarized) + + self.pc_decomposition = pd.DataFrame(self.best.components_, + columns=data.columns, + index = [f'PC-{i + 1}' for i, _ in enumerate(self.best.components_)]) + + self.num_components = self.best.n_components_ + self.explained_variance = self.best.explained_variance_ratio_ + + return pd.DataFrame(values, columns=[f'PC-{i}' for i in range(values.shape[1])]) + + def transform(self, data): + return self.best.transform(data) + + def inverse_transform(self, data): + return self.best.inverse_transform(data) + + def plot_variance(self): + sns.lineplot(self.explained_variance, color='#FF6961') + sns.despine() + plt.hlines(self.target_variance, 0, len(self.explained_variance), linestyles='dashed', colors='#4D82BC') + plt.text(len(self.explained_variance) - 40, + self.target_variance - (self.target_variance * 0.03), + f"Explained variance = {self.target_variance * 100}%", + fontfamily='fantasy', + color="#4D82BC") + plt.show() + + +class PCA_kernel: + def __init__(self,target_variance, kernel, num_components=None, **kwargs) -> None: + self.target_variance = target_variance + self.kernel = kernel + self.num_components = num_components + self.eigensol = ['auto','dense','arpack','randomized'] + self.fit_inverse_transform = True + self.kwargs = kwargs + + def run(self, data: pd.DataFrame) -> pd.DataFrame: + x_norm = pd.DataFrame(StandardScaler().fit_transform(data), columns = data.columns) + self.kernel_pca = KernelPCA(n_components = self.num_components, # Num components = None -> Quedarse con todas las que seas > 0 + kernel = self.kernel , + eigen_solver = self.eigensol[0], + fit_inverse_transform = self.fit_inverse_transform) + + pca_ft = self.kernel_pca.fit_transform(x_norm) + self.explained_variance = self.kernel_pca.eigenvalues_/np.sum(self.kernel_pca.eigenvalues_) + self.vectors = self.kernel_pca.eigenvectors_ + if self.num_components is not None: + num_comp_exp = next(i for i, s in enumerate([sum(self.explained_variance[:idx+1]) for idx in range(len(self.explained_variance))]) if s > self.target_variance) + else: + num_comp_exp = self.num_components + + pca_ft_exp = pca_ft[:,:num_comp_exp] + + return pd.DataFrame(pca_ft_exp, columns=[f'PC-{i}' for i in 
range(pca_ft_exp.shape[1])]) + + def transform(self, data): + return self.kernel_pca.transform(data) + + def inverse_transform(self, data): + return self.kernel_pca.inverse_transform(data) + + def plot_variance(self): + sns.lineplot(self.explained_variance, color='#FF6961') + sns.despine() + plt.hlines(self.target_variance, 0, len(self.explained_variance), linestyles='dashed', colors='#4D82BC') + plt.text(len(self.explained_variance) - 40, + self.target_variance - (self.target_variance * 0.03), + f"Explained variance = {self.target_variance * 100}%", + fontfamily='fantasy', + color="#4D82BC") + plt.show() diff --git a/code/feature_selection.py b/code/feature_selection.py new file mode 100644 index 0000000000000000000000000000000000000000..376b0ceef55eba80a82f38aefcfb1d73e9e055b8 --- /dev/null +++ b/code/feature_selection.py @@ -0,0 +1,148 @@ +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +# Machine learning +import xgboost as xgb +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import RandomForestRegressor +from sklearn.feature_selection import SelectKBest, f_regression, RFE + + +DELETE_PERC = 90 + +# ELBOW METHOD +def find_stable_index(metric_values: np.array, tolerance: float)-> float: + """ + Finds the index where a metric stabilizes in an array of values. + + Parameters: + - metric_values: A list or array of metric values. + - tolerance: The maximum allowed difference between successive values. + + Returns: + - The index where the metric stabilizes or None if not found. + """ + for i in range(1, len(metric_values)): + if abs(metric_values[i] - metric_values[i - 1]) <= tolerance: + return i + return None + +def select_optimun(array_s: np.array, tolerance: float) -> float: + + inertia_diff = [array_s[i] - array_s[i - 1] for i in range(1, len(array_s))] + index_opt = find_stable_index(inertia_diff,tolerance=tolerance) + return index_opt + + +# FEATURE SELECTORS +def q1_feature_selection(x: pd.DataFrame, y: pd.DataFrame, perc: float, stats: str = None) -> pd.DataFrame: + """ Selects the best X% features for each dependent variable (columns on y) + + Args: + x (pd.DataFrame): Independent variables dataframe + y (pd.DataFrame): Dependent variables dataframe + + Returns: + pd.DataFrame: Pandas dataframe containing the names of selected features and their scores + """ + name = y.columns[0] + + fs = SelectKBest(score_func=f_regression, k='all') # Aqui podriamos poner para que coja el 25% mejor (i.e. 
75) + fs.fit(x, y[name]) + + scores_df = pd.DataFrame.from_dict({ 'Score': + dict(zip(x.columns, [s for s in fs.scores_]))}, + orient='index').T + all_scores = scores_df.sort_values(by='Score', ascending=False) + num_opt_features = select_optimun(np.array(all_scores['Score']), tolerance = 0.1) + num_perc_features = round(num_opt_features / x.shape[1] * 100, 2) + # Get only the most relevant features + indexes = np.where(fs.scores_ >= np.percentile(fs.scores_, perc)) + scores_df = scores_df.iloc[indexes[0]] + scores_df = scores_df.sort_values(by='Score', ascending=False)[:num_opt_features] + + if stats: + plt.figure(figsize=(20, 10)) + sns.barplot(all_scores.T, orient = 'v').set(title=f'Feature importance for {name}') + sns.despine() + plt.axvline(num_opt_features) + plt.hlines(all_scores.iloc[num_opt_features], 0, 800, linestyles='dashed', colors="#4D82BC", linewidth=2) + plt.text(810, all_scores.iloc[num_opt_features], f'{num_perc_features}%', fontsize=18, fontfamily='fantasy', color="#4D82BC") + plt.xticks([]) + plt.ylabel('f-score') + plt.xlabel('Features') + plt.tight_layout() + plt.savefig(f"./results/feature_extraction/{name.replace('/', '_')}_FI.png", transparent=True) + + scores_df.to_pickle(f"./results/feature_extraction/{name.replace('/', '_')}_{perc}%feat.pkl") + + return scores_df + +def rfe_selection(x: pd.DataFrame, y: pd.DataFrame, perc: float, estimator = None, **kwargs) -> pd.DataFrame: + """ Performs recursive feature elimination (RFE) using Decission Tree as estimator. + + Args: + x (pd.DataFrame): Independent variables dataframe + y (pd.DataFrame): Dependent variables dataframe + perc (float): Percentage of features to eliminate in % (0-100) + + Returns: + pd.DataFrame: Pandas dataframe containing the names of selected features and their scores + """ + if estimator is None: + estimator = DecisionTreeRegressor(random_state=kwargs['random_state']) + + name = y.columns[0] + num_feats = max(1, int(x.shape[1] - (perc / 100.0 * x.shape[1]))) + + rfe = RFE(estimator=estimator, n_features_to_select = num_feats) + rfe.fit(x, y[name]) + + result = pd.DataFrame(rfe.estimator_.feature_importances_, index=rfe.get_feature_names_out()) + result.to_pickle(f"./results/feature_extraction/{name.replace('/', '_')}_{perc}%feat_RFE.pkl") + + return result + +def xgboost_selection(x: pd.DataFrame, y: pd.DataFrame, tolerance=0.01, **kwargs) -> None: + xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=kwargs['random_state'], n_jobs=kwargs['n_jobs']) + xgb_model.fit(x, y) + + num_opt_features = select_optimun(xgb_model.feature_importances_, tolerance = tolerance) + results = pd.DataFrame(xgb_model.feature_importances_, + index=xgb_model.get_booster().feature_names, + columns=['Score']) + results = results.sort_values(by='Score', ascending=False)[:num_opt_features] + + # Log + if False: + xgb.plot_importance(xgb_model) + plt.show() + + return results + +def random_forest_selector(x: pd.DataFrame, y: pd.DataFrame, tolerance=0.01, **kwargs): + feature_names = list(x.columns) + forest = RandomForestRegressor(n_estimators= 200, random_state=kwargs['random_state'], n_jobs=kwargs['n_jobs']) + forest.fit(x, y) + + num_opt_features = select_optimun(forest.feature_importances_, tolerance = tolerance) + results = pd.DataFrame(forest.feature_importances_, + index=feature_names, + columns=['Score']) + + results = results.sort_values(by='Score', ascending=False)[:num_opt_features] + + # Plot + if False: + fig, ax = plt.subplots() + forest_importances = 
pd.Series(forest.feature_importances_, index=feature_names) + forest_importances.plot.bar(ax=ax) + ax.set_title("Feature importances using MDI") + ax.set_ylabel("Mean decrease in impurity") + fig.tight_layout() + plt.show() + + return results + \ No newline at end of file diff --git a/code/load_saved_models.py b/code/load_saved_models.py new file mode 100644 index 0000000000000000000000000000000000000000..77dac6aaf4fa5658acd516c8d34b6cde801fbeda --- /dev/null +++ b/code/load_saved_models.py @@ -0,0 +1,86 @@ +import pickle +import numpy as np +import pandas as pd +from typing import Tuple +from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error + + +class Ensemble: + def __init__(self, models) -> None: + """ Initializes an ensemble of machine learning models. + + Args: + models (list): A list of machine learning models to be included in the ensemble. + """ + self.ensemble = models + + def predict(self, x) -> np.array: + """ Generate predictions for the input data using the ensemble. + + Args: + x (array-like): The input data for making predictions. + + Returns: + array: The predicted values based on the ensemble's average prediction. + + Notes: + This method assumes that each model in the ensemble has a `predict` method. + """ + predictions = [] + for model in self.ensemble: + predictions.append(model.predict(x)) + + return predictions + + def predict_ensemble(self, x): + return np.mean(self.predict(x), axis=0) + + def score(self, x, y) -> np.array: + """ Calculate the ensemble's mean score on the given input data and target values. + + Args: + x (array-like): The input data for scoring. + y (array-like): The target values for scoring. + + Returns: + float: The mean score across all models in the ensemble. + + Notes: + This method assumes that each model in the ensemble has a `score` method. 
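+            For scikit-learn regressors, `score` is the coefficient of determination (R²).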
+ """ + scores = [] + for model in self.ensemble: + scores.append(model.score(x, y)) + + return np.mean(scores, axis=0) + + def compute_mae_mape(self, x, y) -> np.array: + y_hat = self.predict(x) + maes = [] + mapes = [] + for pred in y_hat: + maes.append(mean_absolute_error(y, pred)) + mapes.append(mean_absolute_percentage_error(y, pred)) + + return maes, mapes + + def compute_mae_mape_ensemble(self, x, y): + maes, mapes = self.compute_mae_mape(x, y) + return np.mean(maes, axis=0), np.mean(mapes, axis=0) + + +def load_ensemble(data_name: str, model_data_path: str, seed: int) -> Tuple[pd.DataFrame, Ensemble]: + """Return the ensemble of models previously saved + + Args: + data_name (str): ID of thedata to be loaded + model_data_path (str): path to models.pickle file + + Returns: + Ensemble: The set of model on which predictions will be performed + """ + data_all = pickle.load(open(f'data/all_processed_{seed}.pkl','rb')) + ensemble = Ensemble(pickle.load(open(model_data_path, 'rb'))) + #print(data_name) + #print(model_data_path) + return data_all[data_name], ensemble diff --git a/code/main.py b/code/main.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b9ccb3d141eb563ba50650f94e2a3fd540b838 --- /dev/null +++ b/code/main.py @@ -0,0 +1,175 @@ +import os +from glob import glob +import numpy as np +import pandas as pd +from tqdm import tqdm +from typing import Any +from pickle import dump as pdump + +# User imports +from prebio_df import read_imputed +from results_manager import DataProvider, Results +from preprocessing import impute_missing, delete_outliers +from regression import RegressorType, evaluate_regressor, compute_best_params +from dimensionality_reduction import PCA_variance, PCA_kernel +from feature_selection import xgboost_selection, random_forest_selector + + +class step: + def __init__(self) -> None: + self.count = 0 + + def __call__(self, *args: Any, **kwds: Any) -> int: + self.count += 1 + return self.count + + +# --------------------------------------------- +# Main +# --------------------------------------------- +def main(seed=1234): + ORIGINAL_DF_PATH = './data/polycarbonates_entregable.xlsx' + VAR_NAMES = ['Mn (kg·mol)-1', 'Mw/Mn', 'conversion'] + EXPLAINED_VARIANCE = 0.9 + + kwargs = {'random_state': np.random.RandomState(seed=seed), 'n_jobs': -1} + data_provider = DataProvider() # Data storage and manager + results = Results(seed) # Result saver and manager + step_n = step() # Counter class + + # --------------------------------------------- + # NaN imputation + # --------------------------------------------- + if not os.path.exists('./data/imputed_polycarbonates.xlsx'): + print(f"\n{'-' * 50}\nStep {step_n()}: NaN Filling\n{'-' * 50}") + filled_df = impute_missing(ORIGINAL_DF_PATH, **kwargs) # And adds a column with number of nans per row + # After saving imputed data remove NaNs column so it doesn't affect in the regression pipeline + filled_df = filled_df.drop(columns=['NaNs']) + else: + filled_df = read_imputed().drop(columns=['NaNs']) + + + # --------------------------------------------- + # Delete Outliers + # --------------------------------------------- + print(f"\n{'-' * 50}\nStep {step_n()}: Outlier Deletion\n{'-' * 50}") + outlier_indexes = {name.replace('/', ''): [] for name in VAR_NAMES} + data = {name.replace('/', ''): delete_outliers(filled_df, name, index_holder=outlier_indexes[name.replace('/', '')], **kwargs) + for name in VAR_NAMES} + + # Save outlier indexes! 
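+    # outlier_indexes maps each target name (with '/' stripped) to the row positions
+    # that delete_outliers flagged as outliers, so the dropped rows can be traced later.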
+ with open('./data/outliers.pkl','wb') as f: + pdump(outlier_indexes, f) + f.close() + + VAR_NAMES[1] = VAR_NAMES[1].replace('/', '') + + for name in VAR_NAMES: + data_provider.set_data(f'RAW_{name}', data[name]) + + + # --------------------------------------------- + # Dimensionality Reduction: PCA / Kernel PCA + # --------------------------------------------- + print(f"\n{'-' * 50}\nStep {step_n()}: Dimensionality Reduction\n{'-' * 50}") + + pca_var = {name: PCA_variance(EXPLAINED_VARIANCE, **kwargs) for name in VAR_NAMES} + pca_sample_var = {name: PCA_variance(data[name][0].shape[0], **kwargs) for name in VAR_NAMES} + + pca_kernel_poly = {name: PCA_kernel(EXPLAINED_VARIANCE, 'poly',**kwargs) for name in VAR_NAMES} + pca_kernel_poly_sample = {name: PCA_kernel(data[name][0].shape[0], 'poly',**kwargs) for name in VAR_NAMES} + + pca_kernel_rbf = {name: PCA_kernel(EXPLAINED_VARIANCE, 'rbf', **kwargs) for name in VAR_NAMES} + pca_kernel_rbf_sample = {name: PCA_kernel(data[name][0].shape[0], 'rbf', **kwargs) for name in VAR_NAMES} + + for var_name in tqdm(VAR_NAMES): + x, y = data_provider.get_data(f'RAW_{var_name}') + + new_x = pca_var[var_name].run(x) + data_provider.set_data(f'RAW_PCA_var_{var_name}', [new_x, y]) + + new_x = pca_sample_var[var_name].run(x) + data_provider.set_data(f'RAW_PCA_sam_{var_name}', [new_x, y]) + + new_x = pca_kernel_poly[var_name].run(x) + data_provider.set_data(f'RAW_PCA_poly_var_{var_name}', [new_x, y]) + + new_x = pca_kernel_poly_sample[var_name].run(x) + data_provider.set_data(f'RAW_PCA_poly_sam_{var_name}', [new_x, y]) + + new_x = pca_kernel_rbf[var_name].run(x) + data_provider.set_data(f'RAW_PCA_rbf_var_{var_name}', [new_x, y]) + + new_x = pca_kernel_rbf_sample[var_name].run(x) + data_provider.set_data(f'RAW_PCA_rbf_sam_{var_name}', [new_x, y]) + + + # --------------------------------------------- + # Feature Selection: XGBoost and Random Forest + # --------------------------------------------- + print(f"\n{'-' * 50}\nStep {step_n()}: Feature Selection\n{'-' * 50}") + + for data_name in tqdm(data_provider.ids): + var_name = data_name.split('_')[-1] + name = data_name.replace(f'_{var_name}', '') + + x, y = data_provider.get_data(data_name) + y = np.ravel(y) + + xgb_vars = xgboost_selection(x, y, tolerance=0.01, **kwargs) + rf_vars = random_forest_selector(x, y, tolerance=0.01, **kwargs) + + data_provider.set_data(f'{name}_XGB_{var_name}', [x[xgb_vars.index], y]) + data_provider.set_data(f'{name}_RF_{var_name}', [x[rf_vars.index], y]) + + + # --------------------------------------------- + # Regression + # --------------------------------------------- + print(f"\n{'-' * 50}\nStep {step_n()}: Regression\n{'-' * 50}") + + if not glob('./configs/*'): + for name in tqdm(data_provider.ids, desc='Computing best params'): + d = data_provider.get_data(name) + compute_best_params(d[0], d[1], name, verbose= 0, **kwargs) + + for name in tqdm(data_provider.ids): + var_name = name.split('_')[-1] + row_name = name.replace(f'_{var_name}', '') + + x, y = data_provider.get_data(name) + + score_mean, score_std, regressors = evaluate_regressor(RegressorType.LINEAR, x, y, name, **kwargs) + results.set_value(var_name, row_name, 'LR(mean)', score_mean) + results.set_value(var_name, row_name, 'LR(std)', score_std) + results.set_models(f'{name}_linear', regressors) + + score_mean, score_std, regressors = evaluate_regressor(RegressorType.SVR, x, y, name, **kwargs) + results.set_value(var_name, row_name, 'SVR(mean)', score_mean) + results.set_value(var_name, row_name, 'SVR(std)', 
score_std) + results.set_models(f'{name}_svr', regressors) + + score_mean, score_std, regressors = evaluate_regressor(RegressorType.RANDOM_FOREST, x, y, name, **kwargs) + results.set_value(var_name, row_name, 'RF(mean)', score_mean) + results.set_value(var_name, row_name, 'RF(std)', score_std) + results.set_models(f'{name}_forest', regressors) + + # --------------------------------------------- + # SAVE! + # --------------------------------------------- + data_provider.to_pickle(path = f'./data/all_processed_{seed}.pkl') + results.to_excel() + results.save_models(path=f'./results/models/{seed}/') + + +if __name__ == "__main__": + var_names = ['Mn(kg·mol)-1', 'MwMn', 'conversion'] + seeds = [645, 4578, 72, 2365, 90345, 24, 1859, 1334, 2078, 2446, 7409, 6995, 2041, 449, 9475] + + for seed in seeds: + main(seed=seed) + + for name in var_names: + dfs = [pd.read_excel(f"./results/regression/{name}_{seed}.xlsx", index_col=0) for seed in seeds] + mean_df = pd.concat(dfs).groupby(level=0).mean() + mean_df.to_excel(f"./results/regression/{name.replace(' ','').replace('/','_')}_mean.xlsx") diff --git a/code/nan_analysis.py b/code/nan_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..bc4d58020d25f5780b9b19374c506efc213b090c --- /dev/null +++ b/code/nan_analysis.py @@ -0,0 +1,63 @@ +import numpy as np +import pandas as pd + + +def missing_values_table(df: pd.DataFrame, title:str=None) -> pd.DataFrame: + """Generate a table of missing values in the dataframe. + + Args: + df (pd.DataFrame): Input dataframe. + title (str, optional): Title for the missing values table. Defaults to '_____ NAN ANALYSIS {df.shape} _____'. + + Returns: + pd.DataFrame: Table of missing values with columns 'Missing Values' and '% of Total Values'. + """ + mis_val = df.isnull().sum() + mis_val_percent = 100 * mis_val / len(df) + mis_percent = 100 * sum(mis_val.values) / np.prod(df.shape) + mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1) + + new_columns = {0 : 'Missing Values', 1 : '% of Total Values'} + mis_val_table_ren_columns = mis_val_table.rename(columns = new_columns) + + mis_val_table_ren_columns = mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:,1] != 0] \ + .sort_values('% of Total Values', ascending=False) \ + .round(1) + + msg = f'_____ NAN ANALYSIS {df.shape} _____' + print(f'{msg if title is None else title}') + print(f'> Your selected dataframe has {df.shape[1]} columns and {df.shape[0]} samples.') + print(f'> A {mis_percent: .4}% of the dataset is missing') + columns_with_nans = mis_val_table_ren_columns.where(mis_val_table_ren_columns['% of Total Values'] > 0).index + print(f'> There are {len(columns_with_nans)} columns with missing values.') + + if not mis_val_table_ren_columns.empty: + numerical_cols = df[columns_with_nans].select_dtypes(include=np.number) + categorical_cols = df[columns_with_nans].select_dtypes(include=object) + print("> From those columns:") + print(f" > {len(categorical_cols.columns)} columns are categorical") + print(f" > {len(numerical_cols.columns)} columns are numerical") + + return mis_val_table_ren_columns + +def get_nans_per_row(df) -> pd.DataFrame: + """ Count number of NaNs per row and add a column + + Returns: + pd.DataFrame: Dataframe with the column of number of NaNs added + """ + data = df + + cols = list(data.columns) + cols.append('NaNs') + + new_data = pd.DataFrame(columns=cols) + + for index in data.index: + row = data.iloc[index, :] + nans = row.isnull().sum() + new_row = list(row) + new_row.append(nans) + 
new_data.loc[len(new_data)] = new_row + + return new_data diff --git a/code/prebio_df.py b/code/prebio_df.py new file mode 100644 index 0000000000000000000000000000000000000000..3035477424d39e310e5ce90326f50da344f736a1 --- /dev/null +++ b/code/prebio_df.py @@ -0,0 +1,68 @@ +import numpy as np +import pandas as pd +import pickle as pkl +from typing import Dict, Tuple +from nan_analysis import missing_values_table + + +def read_prebio(path: str, preprocess: bool = True) -> pd.DataFrame: + """Load PREBIO2 dataset and optionally preprocess) + + Args: + path (str): Path to file + preprocess (bool, optional): Apply the preprocessing schedule designed for PREBIO2 or not. Defaults to True. + + Returns: + pd.DataFrame: Loaded pandas DataFrame + """ + df = pd.read_excel(path, decimal=',', header=3) + df = df.iloc[0:201,0:1179].copy() + return preprocess_prebio(df) if preprocess else df + + +def preprocess_prebio(df: pd.DataFrame) -> pd.DataFrame: + """Applies the preprocessing steps to PREBIO2 DF + + Args: + df (pd.DataFrame): The PREBIO2 dataframe loaded as pandas DF + + Returns: + pd.DataFrame: Preprocessed dataframe + """ + # Note that we have computed some values in the 'time (h)' column by hand. Rows 199,200,201 and 202 + df = df.drop(columns=['Code']) + df = df.drop(columns=['Tg (ºC)']) # We're dropping because it is completely empty! + df = df.replace(regex=True, to_replace='^invalid value encountered', value=np.nan) + df = df.replace(regex=True, to_replace='^float division by zero', value=np.nan) + df = df.replace(regex=True, to_replace='^min', value=np.nan) + df = df.replace(regex=True, to_replace='^max', value=np.nan) + df = df.replace(regex=True, to_replace='^missing intrinsic state for', value=np.nan) + for i in range(0, 4): + df = df.replace(regex=True, to_replace=f'^288-{552 + i}', value=np.nan) + df['Epoxide'] = df['Epoxide'].fillna('Unknown') + + # We drop columns with > 70% of data missing. + missing_table = missing_values_table(df) + more_than_70_labels = missing_table[missing_table['% of Total Values'] > 70].index + print(more_than_70_labels) + print(len(more_than_70_labels)) + input() + + df = df.drop(columns = more_than_70_labels) + + return df + + +def read_imputed(path: str = './data/imputed_polycarbonates.xlsx') -> pd.DataFrame: + return pd.read_excel(path, decimal=',', header=0, index_col=False).select_dtypes('number') + + +def load_split_df(path: str = './data/imputed_polycarbonates.xlsx') -> tuple: + df = read_imputed(path).select_dtypes('number') + dependent_cols = ['Mn (kg·mol)-1', 'Mw/Mn', 'conversion'] + return df.drop(dependent_cols, axis=1), df[dependent_cols] + + +def load_reduced_df() -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]: + with open('./data/reduced_df.pickle', 'rb') as f: + return pkl.load(f) \ No newline at end of file diff --git a/code/preprocessing.py b/code/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..e5d0020dd3fcfec994f59efb0bdbbdd57bf2fc9a --- /dev/null +++ b/code/preprocessing.py @@ -0,0 +1,132 @@ +import click +import numpy as np +import pandas as pd +from sklearn.impute import KNNImputer + +from prebio_df import read_prebio +from nan_analysis import missing_values_table, get_nans_per_row + +# --------------------------------------------- +# Helper functions +# --------------------------------------------- +def knn_custom_imputation(target: pd.Series, set: pd.DataFrame) -> pd.DataFrame: + """Impute missing values in the dataset using KNN imputation. 
+ + Args: + target (pd.Series): Target row with missing values to be imputed. + set (pd.DataFrame): Dataset containing the target row and other rows for imputation. + + Returns: + pd.DataFrame: Dataset with missing values imputed using KNN imputation. + """ + set.loc[len(set)] = target + imputer = KNNImputer() + res = imputer.fit_transform(set) + + return pd.DataFrame(res, columns = set.columns) + +def check_complete_samples(df: pd.DataFrame) -> tuple: + """Find all samples with 0% of missing values + + Args: + df (pd.DataFrame): Input dataframe + + Returns: + tuple: A tuple containing the indexes of samples with 0% missing values (empty if none), + an array of NaN counts for each row, and the unique counts of NaNs. + """ + nans = np.array([r.isnull().sum() for _, r in df.iterrows()]) + indexes = np.where(nans == 0)[0] + print(f'> The amount of instances with 0% of missing values is {len(indexes)}!') + unique, counts = np.unique(nans, return_counts=True) + print(dict(zip(unique, counts))) + + return indexes, nans, unique + + +# --------------------------------------------- +# Main functions +# --------------------------------------------- +def delete_outliers(df: pd.DataFrame, var_name:str, index_holder = [] , dependent_vars = ['Mn (kg·mol)-1', 'Mw/Mn', 'conversion'], **kwargs) -> list: + """Delete outliers from the dataset using the interquartile range. + + Args: + df (pd.DataFrame): _description_ + var_name (str): _description_ + dependent_vars (list, optional): _description_. Defaults to ['Mn (kg·mol)-1', 'Mw/Mn', 'conversion']. + + Returns: + list: _description_ + """ + print(f"Deleting outliers for {var_name.capitalize()}") + df_name = df.copy(deep=True) + index_holder.clear() + + # Delete outliers using the interquartile range + x, y = df_name.drop(dependent_vars, axis=1), pd.DataFrame(df_name[var_name]) + + # Compute interquantile range and min and max values + q1 = y.quantile(0.25).iloc[0] + q3 = y.quantile(0.75).iloc[0] + iqr = q3 - q1 + + min_lim = q1 - 1.5 * iqr + max_lim = q3 + 1.5 * iqr + + outliers = np.where((y < min_lim) | (y > max_lim))[0] + + # This makes outlier indexes be accesible through index_holder + for o in outliers: + index_holder.append(o) + + print(f'{len(outliers)} deleted') + x = x.drop(index=outliers) + y = np.ravel(y.drop(index=outliers)) + + return x, y + +def impute_missing(ds: str='../data/polycarbonates_entregable.xlsx', df = None, save = True, **kwargs) -> pd.DataFrame: + """Main function to preprocess the dataset by imputing missing values and storing the result. + + Args: + ds (str): Path to the prebio2 dataset (.xlsx). + """ + if df is None: + df = read_prebio(ds) + df = get_nans_per_row(df) + + # Get all the samples with a 0% of missing values + indexes, nans, unique = check_complete_samples(df) + full_df = df.iloc[indexes] + + # For ease imputing new data we delete categorical columns (None of them contains NaNs) + full_df = full_df.drop(columns=['Epoxide', 'SMILES', 'Catalyst']) + + # Assert the new dataset is empty + assert missing_values_table(full_df).empty, "This shouldn't be happening. Your full_df has empty values!" 
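+    # Imputation order matters: rows are processed from fewest to most NaNs, and each
+    # newly imputed row stays in full_df, so it can serve as a KNN neighbour for the
+    # sparser rows handled afterwards.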
+ + # Impute missing data sequentially + for min_nan in np.sort(unique)[1:]: # We omit taking samples with 0% of missing values as they will be repeated + for index in np.where(nans == min_nan)[0]: + full_df = knn_custom_imputation(df.iloc[index], full_df) + + df.update(full_df) + missing_values_table(df, title='------ Imputed DataFrame ------') + + if save: + path = './data/imputed_polycarbonates.xlsx' + print(f'The new dataset will be stored in {path}') + df.to_excel(path, index = False) + + return df.select_dtypes('number') + + +@click.command +@click.option('--ds', default='./data/polycarbonates_entregable.xlsx', help='Path to prebio2 dataset (.xlsx)') +def main(ds): + impute_missing(ds) + + +if __name__ == "__main__": + main() + \ No newline at end of file diff --git a/code/regression.py b/code/regression.py new file mode 100644 index 0000000000000000000000000000000000000000..bd6132f8098c7e51a8cb5ce3955655fab1480c2a --- /dev/null +++ b/code/regression.py @@ -0,0 +1,189 @@ +import pickle +import tqdm +import numpy as np +import pandas as pd +from enum import Enum +from sklearn.svm import SVR +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV +from sklearn.model_selection import LeaveOneOut, cross_validate +from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error +from load_saved_models import Ensemble + + +class RegressorType(Enum): + """An identifier of the regressors we've implemented """ + LINEAR = 1, + SVR = 2, + RANDOM_FOREST = 3 + + +# --------------------------------------------- +# Grid Search +# --------------------------------------------- +def scikit_GS(regressor: callable, parameters: dict, X: pd.DataFrame, y: pd.DataFrame, verbose = 0, random_gs = False, **kwargs): + """ Perform a Grid Search over a set of parameters and return initialized regressor + Only works with scikit regressors + + Args: + regressor (callable): scikit regressor callable + parameters (dict): set of parameters where the grid search is performed + X (pd.DataFrame): fit x data + y (pd.DataFrame): fit y data + + Returns: + The best parameters + """ + file_name = kwargs.pop('file_name') + n_jobs = kwargs.setdefault('n_jobs', -1) + n_jobs = kwargs.pop('n_jobs') + + if random_gs: + clf = RandomizedSearchCV( + regressor(**kwargs), parameters, + n_iter = 100, + scoring = 'neg_mean_squared_error', + cv = LeaveOneOut(), + verbose = verbose, + n_jobs = n_jobs) + else: + clf = GridSearchCV(regressor(**kwargs), parameters, + scoring = 'neg_root_mean_squared_error', + cv = LeaveOneOut(), + verbose = verbose, + n_jobs = n_jobs) + clf.fit(X, y) + + if not random_gs: # Only save if random search is not performed + print(f'Best parameters for {regressor}: {clf.best_params_}') + with open(f"./configs/{file_name}.pkl",'wb') as f: + pickle.dump(clf.best_params_, f) + f.close() + + return clf.best_params_ + + +def find_params(regressor: RegressorType, x, y, verbose = 0, **kwargs): + """ Trains and performs grid search over a set of regressors (which can be extended in RegressorType) + and returns the score. + + Args: + regressor (RegressorType): _description_ + x (list, np.array): list of train samples + y (list, np.array): list of train targets + verbose (int, optional): The level of detail for Grid Search. Defaults to 0. 
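+        **kwargs: must provide 'random_state'; the grid-search paths also expect 'file_name'
+            and optionally 'n_jobs' (default -1). Remaining entries are forwarded to the
+            regressor constructor.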
+ """ + random = kwargs.pop('random_state') + + if regressor == RegressorType.LINEAR: + LinearRegression() # Doesn't have any parameter to tune + + if regressor == RegressorType.SVR: + params = {'kernel': ['poly', 'rbf'], + 'C': np.arange(0.1, 1.0, 0.3), + 'epsilon': np.arange(0, 1, 0.1)} + + scikit_GS(SVR, params, x, y, verbose = verbose, **kwargs) + + if regressor == RegressorType.RANDOM_FOREST: + random_params = {'n_estimators': [10, 50, 100, 150, 200], + 'max_depth': [None] + list(np.random.randint(1, 20, 10)), + 'min_samples_split': list(np.random.randint(2, 20, 10)), + 'min_samples_leaf': list(np.random.randint(1, 20, 10)), + 'max_features': ['sqrt', 'log2', None] + list(np.random.uniform(0.1, 0.9, 10))} + + br = scikit_GS(RandomForestRegressor, random_params, x, y, + verbose = verbose, + random_state=random, + random_gs=True, + **kwargs) + + mss = br['min_samples_split'] + msl = br['min_samples_leaf'] + min_depth = max(2, br['max_depth'] - 2) if br['max_depth'] is not None else 0 + max_depth = max(br['max_depth'] + 3, 5) if br['max_depth'] is not None else 0 + + new_params = {'n_estimators': [br['n_estimators']], + 'max_depth': [None] + list(range(min_depth, max_depth)) if br['max_depth'] is not None else [None, 2, 5], + 'min_samples_split': [max(2, mss - 2), mss, mss + 2], + 'min_samples_leaf': [max(2, msl - 2), msl, msl + 2], + 'max_features': [br['max_features']]} + + scikit_GS(RandomForestRegressor, new_params, x, y, + verbose=verbose, + random_state=random, + **kwargs) + + +def compute_best_params(x, y, name: str, verbose = 0, **kwargs): + + kwargs['file_name'] = f"SVR_{name}" + find_params(RegressorType.SVR, x, y, verbose = verbose, **kwargs) + + kwargs['file_name'] = f"RF_{name}" + kwargs['bootstrap'] = True + find_params(RegressorType.RANDOM_FOREST, x, y, verbose = verbose, **kwargs) + + + +# --------------------------------------------- +# Regressor Utils +# --------------------------------------------- +def get_regressor(type: RegressorType, name: str, **kwargs) -> callable: + # Returns the regressor specified with GS loaded parameters + if type == RegressorType.LINEAR: + regressor = LinearRegression() + + elif type == RegressorType.SVR: + my_kwargs = pickle.load(open(f"./configs/SVR_{name}.pkl",'rb')) + regressor = SVR(**my_kwargs) + + elif type == RegressorType.RANDOM_FOREST: + my_kwargs = pickle.load(open(f"./configs/RF_{name}.pkl",'rb')) + my_kwargs['random_state'] = kwargs['random_state'] + # my_kwargs['n_jobs'] = kwargs['n_jobs'] + regressor = RandomForestRegressor(**my_kwargs) + + return regressor + + +def evaluate_regressor(regressor_type, x, y, name, **kwargs): + # Load regressor with best params + regressor = get_regressor(regressor_type, name, **kwargs) + + scores = cross_validate(regressor, x, y, + scoring='neg_mean_absolute_error', + cv=LeaveOneOut(), + n_jobs=kwargs['n_jobs'], + return_estimator=True) + + ensemble = Ensemble(scores['estimator']) + score = ensemble.score(x, y) + + return np.mean(score), np.std(score), scores['estimator'] + + +def regressor_fit(regressor_type, x, y, name, **kwargs): + regressor = get_regressor(regressor_type, name, **kwargs) + loo = LeaveOneOut() + + maes = [] + yhat = [] + for train_index, test_index in tqdm.tqdm(loo.split(x)): + X_train, X_test = x.iloc[train_index], x.iloc[test_index] + y_train, y_test = y[train_index], y[test_index] + + # Fit the random forest on the training data + regressor.fit(X_train, y_train) + + # Make predictions on the test data + y_pred = regressor.predict(X_test) + + # Calculate accuracy and store 
it + maes.append(mean_absolute_error(y_test, y_pred)) + yhat.append(float(y_pred)) + + mape = mean_absolute_percentage_error(y, yhat) + + return yhat, maes, mape, regressor diff --git a/code/results_manager.py b/code/results_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..23873a5335060117faaff91c4eb2b46b4f439a4b --- /dev/null +++ b/code/results_manager.py @@ -0,0 +1,80 @@ +import os +import pickle +import regex as re +import pandas as pd +from typing import Any + + +class DataProvider: + def __init__(self) -> None: + self.data = {} # Format: Name: Dict({var_name: [x, y]}) + + @property + def ids(self): + return list(self.data.keys()) + + def get_id_regex(self, regex): + names = [] + for id in self.ids: + res = re.findall(regex, id) + if res: + names.append(id) + + return names + + def set_data(self, name:str, data: Any): + self.data[name] = data + + def get_data(self, name): + return self.data[name] + + def to_pickle(self, path = './data/all_processed.pkl'): + with open(path, 'wb') as f: + pickle.dump(self.data, f) + + +class Results: + def __init__(self, seed = None) -> None: + + var_names = ['Mn (kg·mol)-1', 'MwMn', 'conversion'] + + rname = ['RAW','RAW_PCA_var','RAW_PCA_sam','RAW_PCA_poly_var','RAW_PCA_poly_sam','RAW_PCA_rbf_var','RAW_PCA_rbf_sam','RAW_XGB', 'RAW_PCA_var_XGB','RAW_PCA_sam_XGB','RAW_PCA_poly_var_XGB','RAW_PCA_poly_sam_XGB','RAW_PCA_rbf_var_XGB','RAW_PCA_rbf_sam_XGB', 'RAW_RF','RAW_PCA_var_RF', 'RAW_PCA_sam_RF','RAW_PCA_poly_var_RF','RAW_PCA_poly_sam_RF','RAW_PCA_rbf_var_RF','RAW_PCA_rbf_sam_RF'] + + cname = ['LR(mean)', 'LR(std)', 'SVR(mean)','SVR(std)','RF(mean)','RF(std)'] + self.seed = seed + self.results = {} + self.models = {} + + for name in var_names: + self.results[name] = pd.DataFrame(index=rname, columns=cname) + + def set_value(self, var_name, row_name, col_name, value): + self.results[var_name].at[row_name, col_name] = value + + def set_models(self, name, models): + self.models[name] = models + + def retrieve_data(self, var_name): + return self.results[var_name] + + def to_csv(self, path = './results/regression/'): + for k in self.results: + n = f"{path}{k.replace(' ', '').replace('/', '_')}" + n += f'_{self.seed}.csv' if self.seed is not None else '' + + self.results[k].to_csv(n) + + def to_excel(self, path = './results/regression/'): + for k in self.results: + n = f"{path}{k.replace(' ', '').replace('/', '_')}" + n += f'_{self.seed}.xlsx' if self.seed is not None else '' + + self.results[k].to_excel(n) + + def save_models(self, path='./results/models/'): + os.makedirs(path, exist_ok=True) + for k in self.models: + with open(f'{path}{k}.pkl','wb') as f: + pickle.dump(self.models[k], f) + f.close() + \ No newline at end of file diff --git a/configs/holder.txt b/configs/holder.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/data/imputed_polycarbonates.xlsx b/data/imputed_polycarbonates.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..eb2742fcd5b21612c1621ea425f134e8df31c353 Binary files /dev/null and b/data/imputed_polycarbonates.xlsx differ diff --git a/data/polycarbonates_entregable.xlsx b/data/polycarbonates_entregable.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..63785cfb14e02ee503782de430c8c43fcb55ae2b Binary files /dev/null and b/data/polycarbonates_entregable.xlsx differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..3c63110e1fe5bc3f7c1c257ffd75106dd83303bb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,26 @@ +click==8.1.7 +colorama==0.4.6 +contourpy==1.2.1 +cycler==0.12.1 +et-xmlfile==1.1.0 +fonttools==4.53.0 +joblib==1.4.2 +kiwisolver==1.4.5 +matplotlib==3.9.0 +numpy==2.0.0 +openpyxl==3.1.4 +packaging==24.1 +pandas==2.2.2 +pillow==10.3.0 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +pytz==2024.1 +regex==2024.5.15 +scikit-learn==1.5.0 +scipy==1.14.0 +seaborn==0.13.2 +six==1.16.0 +threadpoolctl==3.5.0 +tqdm==4.66.4 +tzdata==2024.1 +xgboost==2.1.0 diff --git a/results/models/holder.txt b/results/models/holder.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/results/regression/holder.txt b/results/regression/holder.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
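Usage sketch: the snippet below shows one way the saved artifacts could be reused after `code/main.py` has finished. It assumes the repository root as working directory with `code/` added to the import path, and that the dataset ID `RAW_XGB_conversion` and the `results/models/<seed>/<id>_forest.pkl` naming (derived from `DataProvider` ids and `Results.save_models`) exist for the chosen seed; treat these names as illustrative, not guaranteed.

```python
# Hypothetical example of reusing the saved leave-one-out ensembles.
# Assumes main.py has already been run for seed 645 from the repository root,
# and that 'RAW_XGB_conversion' is one of the dataset IDs it produced.
import sys
sys.path.append('code')  # make the code/ modules importable

from load_saved_models import load_ensemble

seed = 645
data_name = 'RAW_XGB_conversion'                               # assumed <variant>_<target> ID
model_path = f'results/models/{seed}/{data_name}_forest.pkl'   # assumed Results.save_models naming

# load_ensemble returns the stored (x, y) pair for this ID plus an Ensemble of the fitted models
(x, y), ensemble = load_ensemble(data_name, model_path, seed)

print(ensemble.predict_ensemble(x)[:5])           # mean prediction over the LOO estimators
print(ensemble.compute_mae_mape_ensemble(x, y))   # (mean MAE, mean MAPE) across estimators
```

Because every estimator comes from one leave-one-out fold, `predict_ensemble` simply averages the per-fold predictions.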