Merge pull request #37 from GauravPandeyLab/minor-fixes

Minor fixes
GauravPandeyLab · Oct 26, 2023 · 04dd165 · 04dd165
2 parents eb3fba4 + 52846d3
commit 04dd165
Show file tree

Hide file tree

Showing 9 changed files with 71 additions and 45 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -24,9 +24,9 @@ jobs:
 
  steps:
  - name: Check out the repository
- uses: actions/checkout@v1
+ uses: actions/checkout@v3
  - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v1
+ uses: actions/setup-python@v3
  with:
  python-version: ${{ matrix.python-version }}
 

diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst
@@ -5,4 +5,5 @@ API Reference
  :maxdepth: 2
 
  ensemble_integration
- permutation_interpreter
+ permutation_interpreter
+ datasets
diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst
@@ -0,0 +1,4 @@
+Datasets
+--------
+
+.. autofunction:: eipy.datasets.load_diabetes
diff --git a/eipy/dataset.py → eipy/datasets.py b/eipy/dataset.py → eipy/datasets.py
@@ -1,19 +1,15 @@
 import pandas as pd
-import numpy as np
-import os 
-from os import environ, listdir, makedirs
-from os.path import expanduser, isdir, join, splitext
-
-from urllib.request import urlretrieve
+import os
+from os import environ, makedirs
+from os.path import expanduser, join
 import wget
 import zipfile
-
+
+
 def _load_csv(file_path, fn, suffix):
- return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"),
- index_col=0)
+ return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"), index_col=0)
 
 
-
 def get_data_home(data_home=None):
  """Return the path of the eipy data directory.
 
@@ -48,47 +44,58 @@ def get_data_home(data_home=None):
  makedirs(data_home, exist_ok=True)
  return data_home
 
-def _check_dirExist_mkdir(folder_path):
- if os.path.exists(folder_path):
- return True
- else:
- os.makedirs(folder_path)
- return False
 
 def load_diabetes():
+ """
+ Loads a multi-modal youth diabetes dataset.
+
+ More information about this dataset can be found in the following publication:
 
+ Catherine McDonough, Yan Chak Li, Nita Vangeepuram, Bian Liu, Gaurav Pandey.
+ Facilitating youth diabetes studies with the most comprehensive epidemiological
+ dataset available through a public web portal. medRxiv 2023.08.02.23293517.
+ https://doi.org/10.1101/2023.08.02.23293517
+
+ """
  zenodo_link = "https://zenodo.org/records/10035422/files/diabetes.zip?download=1"
  # Get data path
  data_path = get_data_home()
  folder_ext = "diabetes"
  data_ext_path = join(data_path, folder_ext)
  # check data downloaded before
  folder_exist = os.path.exists(data_ext_path)
- zip_exist = os.path.exists(data_ext_path+'.zip')
+ zip_exist = os.path.exists(data_ext_path + ".zip")
  if not folder_exist:
  if not zip_exist:
- filename = wget.download(zenodo_link, out=data_path)
- downloaded_path = data_ext_path+'.zip'
- with zipfile.ZipFile(downloaded_path, 'r') as zip_ref:
+ wget.download(zenodo_link, out=data_path)
+ downloaded_path = data_ext_path + ".zip"
+ with zipfile.ZipFile(downloaded_path, "r") as zip_ref:
  zip_ref.extractall(data_path)
- 
+
  _file_path = data_ext_path
- modality_keys = ['Sociodemographic', 'Health status',
- 'Diet', 'Other lifestyle behaviors']
- _train_suffix = '9916'
- _test_suffix = '1618'
+ modality_keys = [
+ "Sociodemographic",
+ "Health status",
+ "Diet",
+ "Other lifestyle behaviors",
+ ]
+ _train_suffix = "9916"
+ _test_suffix = "1618"
  X_train = {k: _load_csv(_file_path, k, _train_suffix) for k in modality_keys}
  X_test = {k: _load_csv(_file_path, k, _test_suffix) for k in modality_keys}
- y_train = _load_csv(_file_path, 'outcomes_label', _train_suffix)
- y_test = _load_csv(_file_path, 'outcomes_label', _test_suffix)
+ y_train = _load_csv(_file_path, "outcomes_label", _train_suffix)
+ y_test = _load_csv(_file_path, "outcomes_label", _test_suffix)
  dictionary = pd.read_csv(join(_file_path, "data_dictionary.csv"))
 
- return {'X_train': X_train,
- 'y_train': y_train,
- 'X_test': X_test,
- 'y_test': y_test,
- 'data_dict': dictionary}
+ return {
+ "X_train": X_train,
+ "y_train": y_train,
+ "X_test": X_test,
+ "y_test": y_test,
+ "data_dict": dictionary,
+ }
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
  loaded_dictionary = load_diabetes()
- print(loaded_dictionary['X_train'])
+ print(loaded_dictionary["X_train"])
diff --git a/eipy/ei.py b/eipy/ei.py
@@ -100,7 +100,8 @@ class EnsembleIntegration:
  Training data for ensemble methods, for each outer fold.
  len(ensemble_training_data) = len(k_outer)
  ensemble_test_data : list of pandas.DataFrame
- Test data for ensemble methods, for each outer fold. len(ensemble_test_data) = len(k_outer)
+ Test data for ensemble methods, for each outer fold.
+ len(ensemble_test_data) = len(k_outer)
  ensemble_predictions : pandas.DataFrame
  Combined predictions (across all outer folds) made by each ensemble method.
  modality_names : list of str
@@ -161,7 +162,10 @@ def __init__(
  self.model_building = model_building
  self.verbose = verbose
 
- self.final_models = {"base models": {}, "ensemble models": {}} # for final model
+ self.final_models = {
+ "base models": {},
+ "ensemble models": {},
+ } # for final model
  self.ensemble_training_data_final = None # for final model
 
  self.cv_outer = StratifiedKFold(
@@ -289,7 +293,9 @@ def fit_ensemble(self, ensemble_predictors=None):
  ensemble_predictions["labels"] = y_test_combined
 
  self.ensemble_predictions = pd.DataFrame.from_dict(ensemble_predictions)
- self.ensemble_summary = ensemble_summary(self.ensemble_predictions, self.metrics)
+ self.ensemble_summary = ensemble_summary(
+ self.ensemble_predictions, self.metrics
+ )
 
  if self.model_building:
  for model_name, model in tqdm(
@@ -360,7 +366,9 @@ def predict(self, X_dict, ensemble_model_key):
  ensemble_prediction_data[0].T.groupby(level=[0, 1]).mean().T
  )
 
- ensemble_model = pickle.loads(self.final_models["ensemble models"][ensemble_model_key])
+ ensemble_model = pickle.loads(
+ self.final_models["ensemble models"][ensemble_model_key]
+ )
 
  y_pred = safe_predict_proba(ensemble_model, ensemble_prediction_data)
  return y_pred

diff --git a/eipy/interpretation.py b/eipy/interpretation.py
@@ -29,9 +29,9 @@ class PermutationInterpreter:
  n_repeats : int, default=10
  Number of repeats in PermutationImportance.
  ensemble_predictor_keys: default='all'
- Ensemble predictor keys used in EnsembleIntegration. If 'all' then all ensemble predictors
- seen by EI are interpreted. Recommended to pass a subset of ensemble_predctor keys as
- a list.
+ Ensemble predictor keys used in EnsembleIntegration. If 'all' then all
+ ensemble predictors seen by EI are interpreted. Recommended to pass a
+ subset of ensemble_predictor keys as a list.
  metric_greater_is_better: default=True
  Metric greater is better.
 

diff --git a/eipy/metrics.py b/eipy/metrics.py
@@ -161,7 +161,7 @@ def ensemble_summary(ensemble_predictions, metrics):
  return create_metric_threshold_dict(X, labels, metrics)
 
 
-# These two functions are an attempt at maximizing/minimizing any metric but they were fairly slow
+# These two functions are an attempt at maximizing/minimizing any metric
 # def metric_scaler_function(arg, y_true, y_pred, metric, pos_label, multiplier):
 # threshold = np.sort(np.unique(y_pred))[int(np.round(arg))]
 # y_binary = (y_pred >= threshold).astype(int)

diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,7 @@ shap = ">=0.42"
 xgboost = ">=1.7"
 pandoc = "^2.3"
 dill = "^0.3.7"
+wget = "^3.2"
 
 [tool.poetry.group.dev.dependencies]
 pytest = ">=6.0"

diff --git a/tests/test_load_data.py b/tests/test_load_data.py
@@ -0,0 +1,5 @@
+from eipy.datasets import load_diabetes
+import pytest
+
+def test_load_diabetes():
+ data = load_diabetes()