Skip to content

Commit

Permalink
Merge pull request #37 from GauravPandeyLab/minor-fixes
Browse files Browse the repository at this point in the history
Minor fixes
  • Loading branch information
03bennej committed Oct 26, 2023
2 parents eb3fba4 + 52846d3 commit 04dd165
Show file tree
Hide file tree
Showing 9 changed files with 71 additions and 45 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ jobs:

steps:
- name: Check out the repository
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}

Expand Down
3 changes: 2 additions & 1 deletion docs/source/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ API Reference
:maxdepth: 2

ensemble_integration
permutation_interpreter
permutation_interpreter
datasets
4 changes: 4 additions & 0 deletions docs/source/datasets.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Datasets
--------

.. autofunction:: eipy.datasets.load_diabetes
75 changes: 41 additions & 34 deletions eipy/dataset.py → eipy/datasets.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
import pandas as pd
import numpy as np
import os
from os import environ, listdir, makedirs
from os.path import expanduser, isdir, join, splitext

from urllib.request import urlretrieve
import os
from os import environ, makedirs
from os.path import expanduser, join
import wget
import zipfile



def _load_csv(file_path, fn, suffix):
return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"),
index_col=0)
return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"), index_col=0)



def get_data_home(data_home=None):
"""Return the path of the eipy data directory.
Expand Down Expand Up @@ -48,47 +44,58 @@ def get_data_home(data_home=None):
makedirs(data_home, exist_ok=True)
return data_home

def _check_dirExist_mkdir(folder_path):
if os.path.exists(folder_path):
return True
else:
os.makedirs(folder_path)
return False

def load_diabetes():
"""
Loads a multi-modal youth diabetes dataset.
More information about this dataset can be found in the following publication:
Catherine McDonough, Yan Chak Li, Nita Vangeepuram, Bian Liu, Gaurav Pandey.
Facilitating youth diabetes studies with the most comprehensive epidemiological
dataset available through a public web portal. medRxiv 2023.08.02.23293517.
https://doi.org/10.1101/2023.08.02.23293517
"""
zenodo_link = "https://zenodo.org/records/10035422/files/diabetes.zip?download=1"
# Get data path
data_path = get_data_home()
folder_ext = "diabetes"
data_ext_path = join(data_path, folder_ext)
# check data downloaded before
folder_exist = os.path.exists(data_ext_path)
zip_exist = os.path.exists(data_ext_path+'.zip')
zip_exist = os.path.exists(data_ext_path + ".zip")
if not folder_exist:
if not zip_exist:
filename = wget.download(zenodo_link, out=data_path)
downloaded_path = data_ext_path+'.zip'
with zipfile.ZipFile(downloaded_path, 'r') as zip_ref:
wget.download(zenodo_link, out=data_path)
downloaded_path = data_ext_path + ".zip"
with zipfile.ZipFile(downloaded_path, "r") as zip_ref:
zip_ref.extractall(data_path)

_file_path = data_ext_path
modality_keys = ['Sociodemographic', 'Health status',
'Diet', 'Other lifestyle behaviors']
_train_suffix = '9916'
_test_suffix = '1618'
modality_keys = [
"Sociodemographic",
"Health status",
"Diet",
"Other lifestyle behaviors",
]
_train_suffix = "9916"
_test_suffix = "1618"
X_train = {k: _load_csv(_file_path, k, _train_suffix) for k in modality_keys}
X_test = {k: _load_csv(_file_path, k, _test_suffix) for k in modality_keys}
y_train = _load_csv(_file_path, 'outcomes_label', _train_suffix)
y_test = _load_csv(_file_path, 'outcomes_label', _test_suffix)
y_train = _load_csv(_file_path, "outcomes_label", _train_suffix)
y_test = _load_csv(_file_path, "outcomes_label", _test_suffix)
dictionary = pd.read_csv(join(_file_path, "data_dictionary.csv"))

return {'X_train': X_train,
'y_train': y_train,
'X_test': X_test,
'y_test': y_test,
'data_dict': dictionary}
return {
"X_train": X_train,
"y_train": y_train,
"X_test": X_test,
"y_test": y_test,
"data_dict": dictionary,
}


if __name__ == '__main__':
if __name__ == "__main__":
loaded_dictionary = load_diabetes()
print(loaded_dictionary['X_train'])
print(loaded_dictionary["X_train"])
16 changes: 12 additions & 4 deletions eipy/ei.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ class EnsembleIntegration:
Training data for ensemble methods, for each outer fold.
len(ensemble_training_data) = len(k_outer)
ensemble_test_data : list of pandas.DataFrame
Test data for ensemble methods, for each outer fold. len(ensemble_test_data) = len(k_outer)
Test data for ensemble methods, for each outer fold.
len(ensemble_test_data) = len(k_outer)
ensemble_predictions : pandas.DataFrame
Combined predictions (across all outer folds) made by each ensemble method.
modality_names : list of str
Expand Down Expand Up @@ -161,7 +162,10 @@ def __init__(
self.model_building = model_building
self.verbose = verbose

self.final_models = {"base models": {}, "ensemble models": {}} # for final model
self.final_models = {
"base models": {},
"ensemble models": {},
} # for final model
self.ensemble_training_data_final = None # for final model

self.cv_outer = StratifiedKFold(
Expand Down Expand Up @@ -289,7 +293,9 @@ def fit_ensemble(self, ensemble_predictors=None):
ensemble_predictions["labels"] = y_test_combined

self.ensemble_predictions = pd.DataFrame.from_dict(ensemble_predictions)
self.ensemble_summary = ensemble_summary(self.ensemble_predictions, self.metrics)
self.ensemble_summary = ensemble_summary(
self.ensemble_predictions, self.metrics
)

if self.model_building:
for model_name, model in tqdm(
Expand Down Expand Up @@ -360,7 +366,9 @@ def predict(self, X_dict, ensemble_model_key):
ensemble_prediction_data[0].T.groupby(level=[0, 1]).mean().T
)

ensemble_model = pickle.loads(self.final_models["ensemble models"][ensemble_model_key])
ensemble_model = pickle.loads(
self.final_models["ensemble models"][ensemble_model_key]
)

y_pred = safe_predict_proba(ensemble_model, ensemble_prediction_data)
return y_pred
Expand Down
6 changes: 3 additions & 3 deletions eipy/interpretation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ class PermutationInterpreter:
n_repeats : int, default=10
Number of repeats in PermutationImportance.
ensemble_predictor_keys: default='all'
Ensemble predictor keys used in EnsembleIntegration. If 'all' then all ensemble predictors
seen by EI are interpreted. Recommended to pass a subset of ensemble_predictor keys as
a list.
Ensemble predictor keys used in EnsembleIntegration. If 'all' then all
ensemble predictors seen by EI are interpreted. Recommended to pass a
subset of ensemble_predictor keys as a list.
metric_greater_is_better: default=True
Metric greater is better.
Expand Down
2 changes: 1 addition & 1 deletion eipy/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def ensemble_summary(ensemble_predictions, metrics):
return create_metric_threshold_dict(X, labels, metrics)


# These two functions are an attempt at maximizing/minimizing any metric but they were fairly slow
# These two functions are an attempt at maximizing/minimizing any metric
# def metric_scaler_function(arg, y_true, y_pred, metric, pos_label, multiplier):
# threshold = np.sort(np.unique(y_pred))[int(np.round(arg))]
# y_binary = (y_pred >= threshold).astype(int)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ shap = ">=0.42"
xgboost = ">=1.7"
pandoc = "^2.3"
dill = "^0.3.7"
wget = "^3.2"

[tool.poetry.group.dev.dependencies]
pytest = ">=6.0"
Expand Down
5 changes: 5 additions & 0 deletions tests/test_load_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from eipy.datasets import load_diabetes
import pytest

def test_load_diabetes():
data = load_diabetes()

0 comments on commit 04dd165

Please sign in to comment.