From be2f72b4e20148714edecead0201b65574b11f17 Mon Sep 17 00:00:00 2001
From: Gregory Lee
Date: Wed, 29 Jul 2020 17:21:54 -0400
Subject: [PATCH 1/3] Add web-scraping script to determine dependents

includes filtering to those specific to biology/medicine
---
 skimage_filter_dependents.py | 347 +++++++++++++++++++++++++++++++++++
 1 file changed, 347 insertions(+)
 create mode 100644 skimage_filter_dependents.py

diff --git a/skimage_filter_dependents.py b/skimage_filter_dependents.py
new file mode 100644
index 0000000..ea483ff
--- /dev/null
+++ b/skimage_filter_dependents.py
@@ -0,0 +1,347 @@
+"""
+At the time this script was created (July 2020), GitHub did not offer an
+official way to query the dependent packages through their API. So, we instead
+use a web-scraping approach via BeautifulSoup, patterned after a response
+in this stack-overflow thread:
+https://stackoverflow.com/questions/58734176/how-to-use-github-api-to-get-a-repositorys-dependents-information-in-github
+
+To retrieve topic lists via the GitHub API, the user must have defined a
+GITHUB_TOKEN environment variable.
+
+This script generates three lists of packages:
+
+1.) One that has ALL dependents that are active repositories (i.e. no "Ghost"
+icon in the web page).
+2.) One list that only retains packages with >= min_stars stars, but also
+includes a list of the GitHub "topics" associated with each package.
+3.) A third list that is based on filtering the second list. During filtering, a
+package is retained if either:
+    a.) Any string from repo_name_terms is in the repository organization/name
+    b.) A topic in the repo's topic lists matches a topic in topic_search_terms
+
+The three variables containing the lists described above are:
+
+Outputs
+-------
+all_packages : list of tuple
+    Each element is a (name, forks, stars) tuple.
+popular_packages : list of tuple
+    Each element is a (name, forks, stars, topics) tuple.
+popular_filtered_packages : list of tuple
+    Each element is a (name, forks, stars, topics) tuple.
+"""
+
+import os
+import pickle
+
+from bs4 import BeautifulSoup
+from github import Github
+import pandas
+import requests
+
+# we use PyGitHub to retrieve topic lists
+token = os.environ['GITHUB_TOKEN']
+g = Github(token)
+
+# ----------------------------------
+# START OF USER-CONFIGURABLE OPTIONS
+# ----------------------------------
+
+# The repository we will query for it's dependents
+repo_to_query = "scikit-image/scikit-image"
+
+# Retrieve detailed topic lists only for the packages with >= min_stars stars.
+min_stars = 5
+
+# If True, will write the three lists to .pickle files in the current directory
+save_to_pickle = False
+# If True, will write the three lists to .csv files in the current directory
+save_to_csv = True
+
+
+# Search terms of interest in the repository organization/name.
+# (see description at top)
+# All terms should be in lower case.
+repo_name_terms = [
+    'brain',
+    'cell',
+    'ecg',
+    'eeg',
+    'medi',
+    'mri',
+    'neuro',
+    'pathol',
+    'retin',
+    'slide',
+    'spectro',
+    'tissue',
+    'tomo',
+]
+
+# Search terms of interest in the repository's topics (see description at top).
+# This list was created to match bio-image applications by manually curating
+# topic names from the full list of packages.
+topic_search_terms = [
+    'airways',
+    'anatomy',
+    'arteries',
+    'astrocytes',
+    'atomic-force-microscopy',
+    'afm',
+    'axon',
+    'bioimage-informatics',
+    'bioinformatics',
+    'biologists',
+    'biomedical-image-processing',
+    'bionic-vision',
+    'biophysics',
+    'brain-connectivity',
+    'brain-imaging',
+    'brain-mri',
+    'brain-tumor-segmentation',
+    'brats',
+    'calcium',
+    'cancer-research',
+    'cell-biology',
+    'cell-detection',
+    'cell-segmentation',
+    'computational-pathology',
+    'connectome',
+    'connectomics',
+    'cryo-em',
+    'ct-data',
+    'deconvolution-microscopy',
+    'dicom',
+    'dicom-rt',
+    'digital-pathology-data',
+    'digital-pathology',
+    'digital-slide-archive',
+    'dmri',
+    'electron-microscopy',
+    'electrophysiology',
+    'fluorescence',
+    'fluorescence-microscopy-imaging',
+    'fmri',
+    'fmri-preprocessing',
+    'functional-connectomes',
+    'healthcare-imaging',
+    'histology',
+    'voxel',
+    'microorganism-colonies',
+    'microscopy',
+    'microscopy-images',
+    'neuroimaging',
+    'medical',
+    'medical-image-computing',
+    'medical-image-processing',
+    'medical-images',
+    'medical-imaging',
+    'mri',
+    'myelin',
+    'neural-engineering',
+    'neuroanatomy',
+    'neuroimaging',
+    'neuroimaging-analysis',
+    'neuropoly',
+    'neuroscience',
+    'nih-brain-initiative',
+    'openslide',
+    'pathology',
+    'pathology-image',
+    'radiation-oncology',
+    'radiation-physics',
+    'raman',
+    'retinal-implants',
+    'scanning-probe-microscopy',
+    'scanning-tunnelling-microscopy',
+    'single-cell-imaging',
+    'slide-images',
+    'spectroscopy',
+    'spinalcord',
+    'stm',
+    'stem',
+    'stitching',
+    'structural-connectomes',
+    'tissue-localization',
+    'tomography',
+    'volumetric-images',
+    'whole-slide-image',
+    'whole-slide-imaging',
+]
+
+# Omit the following repositories from the filtered list.
+# These match at least one of the search terms above, but do not appear to be
+# biology-focused. (e.g. the term "cell" appears in "Marcello").
+omit_list = [
+    'Marcello-Sega/pytim',
+    'PMEAL/porespy'
+]
+
+# --------------------------------
+# END OF USER-CONFIGURABLE OPTIONS
+# --------------------------------
+
+# Parse at most this many web pages.
+# Parsing should automatically stop when reaching the last page.
+max_page_num = 100
+
+packages = True
+url = 'https://github.com/{}/network/dependents?dependent_type=PACKAGE'.format(repo_to_query)
+
+package_list = []
+ghost_list = []
+prev_len = 0
+for i in range(max_page_num):
+    # retrieve HTML for the current URL
+    print("GET " + url)
+    r = requests.get(url)
+    soup = BeautifulSoup(r.content, "html.parser")
+
+    page_package_list = []
+    page_ghost_list = []
+    for t in soup.findAll("div", {"class": "Box-row"}):
+        try:
+            # find repository org/name
+            name = "{}/{}".format(
+                t.find('a', {"data-repository-hovercards-enabled":""}).text,
+                t.find('a', {"data-hovercard-type":"repository"}).text
+            )
+        except AttributeError:
+            # Ghost repositories will give None for the find() calls above.
+            # This results in an AttributeError when trying to access .text
+            page_ghost_list.append(t.text)
+            continue
+
+        # extract the number of stars
+        stars = 'unknown'
+        for span in t.find_all('span', attrs={'class': 'text-gray-light'}):
+            svg_star = span.find_all('svg', attrs={'class': 'octicon-star'})
+            if svg_star:
"1,000" before casting to int + stars = int(span.text.strip().replace(",", "")) + break + + # extract the number of forks + forks = 'unknown' + for span in t.find_all('span', attrs={'class': 'text-gray-light'}): + svg_fork = span.find_all('svg', + attrs={'class': 'octicon-repo-forked'}) + if svg_fork: + # replace ","" in e.g. "1,000" before casting to int + forks = int(span.text.strip().replace(",", "")) + break + + page_package_list.append((name, forks, stars)) + + + # append packages from the current page to the overall lists + package_list = package_list + page_package_list + ghost_list = ghost_list + page_ghost_list + + # remove any duplicates + package_list = list(set(package_list)) + ghost_list = list(set(ghost_list)) + + # terminate if no change from the prior URL + new_len = len(package_list) + len(ghost_list) + if new_len == prev_len: + print("no change in package lists... stopping scraping") + break + prev_len = new_len + + # find the URL for the "Next" page of packages + paginationContainers = soup.find("div", {"class":"paginate-container"}).find_all('a') + url = None + for paginationContainer in paginationContainers: + # Make sure we are retrieving the "Next" page and not the "Previous" + if paginationContainer.text == "Next": + url = paginationContainer["href"] + if url is None: + print("No additional next page found, ... stopping scraping") + break + +# sort by descending number of stars +# This is the first list mentioned at the top. +all_packages = sorted(package_list, key=lambda x:x[2], reverse=True) + +# Create the second list by retaining only those with >= min_stars +# Note that in the package list, the tuple is: +# (name, # of forks, # of stars) +_popular_packages = [p for p in all_packages if p[2] >= min_stars] +n_popular = len(_popular_packages) + +# add a 4th term to each tuple, containing the GitHub topic list +popular_packages = [] + +for n, p in enumerate(_popular_packages): + print("Retrieving topics for package {} of {}".format(n + 1, n_popular)) + repo_name = p[0] + repo = g.get_repo(repo_name) + topics = repo.get_topics() + popular_packages.append(p + (topics,)) + +print("Applying filtering") +popular_filtered_packages = [] +for p in popular_packages: + name = p[0] + name_lower = name.lower() + if name in omit_list: + continue + topics = p[3] + keep = False # unless we match a term below, we will exclude the package + + # check match based on repository organization/name + for m in repo_name_terms: + if m in name_lower: + keep = True + break + + # If not already a match, search based on topic search terms + if not keep: + for topic in topics: + if topic in topic_search_terms: + keep = True + break + if keep: + popular_filtered_packages.append(p) + +# dump output lists to pickle +fname_base = repo_to_query.replace('/', '_') +if save_to_pickle: + print("Writing pickle files") + + os.chdir('/media/lee8rx/data/Dropbox/Dropbox/Grants/CZI') + with open(fname_base + '_all_packages.pickle', 'wb') as f: + pickle.dump(all_packages, f) + + with open(fname_base + '_popular_packages.pickle', 'wb') as f: + pickle.dump(popular_packages, f) + + with open(fname_base + '_popular_filtered_packages.pickle', 'wb') as f: + pickle.dump(popular_filtered_packages, f) + +if save_to_csv: + print("Writing CSV files") + df_all = pandas.DataFrame( + all_packages, + columns=('name', '# of forks', '# of stars') + ) + df_all = df_all.set_index('name') + df_all.to_csv(fname_base + '_all_dependents.csv') + + df_popular = pandas.DataFrame( + popular_packages, + columns=('name', '# of forks', '# 
From 50a3257cdce9a11393305ace6547445e6bee4ddb Mon Sep 17 00:00:00 2001
From: Gregory Lee
Date: Wed, 29 Jul 2020 22:14:29 -0400
Subject: [PATCH 2/3] pep8 fixes

---
 skimage_filter_dependents.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/skimage_filter_dependents.py b/skimage_filter_dependents.py
index ea483ff..e29c73d 100644
--- a/skimage_filter_dependents.py
+++ b/skimage_filter_dependents.py
@@ -14,8 +14,8 @@
 icon in the web page).
 2.) One list that only retains packages with >= min_stars stars, but also
 includes a list of the GitHub "topics" associated with each package.
-3.) A third list that is based on filtering the second list. During filtering, a
-package is retained if either:
+3.) A third list that is based on filtering the second list. During filtering,
+a package is retained if either:
     a.) Any string from repo_name_terms is in the repository organization/name
     b.) A topic in the repo's topic lists matches a topic in topic_search_terms
@@ -58,8 +58,6 @@
 # If True, will write the three lists to .csv files in the current directory
 save_to_csv = True
-
-
 # Search terms of interest in the repository organization/name.
 # (see description at top)
 # All terms should be in lower case.
 repo_name_terms = [
@@ -187,7 +185,8 @@
 max_page_num = 100
 
 packages = True
-url = 'https://github.com/{}/network/dependents?dependent_type=PACKAGE'.format(repo_to_query)
+url = ('https://github.com/{}/network/dependents'
+       '?dependent_type=PACKAGE').format(repo_to_query)
 
 package_list = []
 ghost_list = []
 prev_len = 0
@@ -204,8 +203,8 @@
         try:
             # find repository org/name
             name = "{}/{}".format(
-                t.find('a', {"data-repository-hovercards-enabled":""}).text,
-                t.find('a', {"data-hovercard-type":"repository"}).text
+                t.find('a', {"data-repository-hovercards-enabled": ""}).text,
+                t.find('a', {"data-hovercard-type": "repository"}).text
             )
         except AttributeError:
             # Ghost repositories will give None for the find() calls above.
@@ -234,7 +233,6 @@
         page_package_list.append((name, forks, stars))
 
-
     # append packages from the current page to the overall lists
     package_list = package_list + page_package_list
     ghost_list = ghost_list + page_ghost_list
@@ -251,7 +249,8 @@
     prev_len = new_len
 
     # find the URL for the "Next" page of packages
-    paginationContainers = soup.find("div", {"class":"paginate-container"}).find_all('a')
+    paginationContainers = soup.find(
+        "div", {"class": "paginate-container"}).find_all('a')
     url = None
     for paginationContainer in paginationContainers:
         # Make sure we are retrieving the "Next" page and not the "Previous"
@@ -263,7 +262,7 @@
 # sort by descending number of stars
 # This is the first list mentioned at the top.
-all_packages = sorted(package_list, key=lambda x:x[2], reverse=True)
+all_packages = sorted(package_list, key=lambda x: x[2], reverse=True)
 
 # Create the second list by retaining only those with >= min_stars
 # Note that in the package list, the tuple is:
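As the script's docstring notes, the topic lists themselves come from the GitHub API through PyGitHub, which requires a GITHUB_TOKEN environment variable. Taken in isolation, that step amounts to the short sketch below; the repository name is only an example, and any "org/name" string returned by the scraper would be passed in the same way.

    import os

    from github import Github  # PyGitHub, the same client the script uses

    g = Github(os.environ["GITHUB_TOKEN"])           # token must be exported beforehand
    repo = g.get_repo("scikit-image/scikit-image")   # example "org/name" string
    print(repo.get_topics())                         # plain list of topic strings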
Lee" Date: Wed, 30 Sep 2020 15:09:28 -0400 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Marianne Corvellec --- skimage_filter_dependents.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/skimage_filter_dependents.py b/skimage_filter_dependents.py index e29c73d..d37e752 100644 --- a/skimage_filter_dependents.py +++ b/skimage_filter_dependents.py @@ -1,7 +1,7 @@ """ At the time this script was created (July 2020), GitHub did not offer an -official way to query the dependent packages through their API. So, we instead -use a web-scraping approach via BeautifulSoup, patterned after a response +official way to query dependent packages through their API. So, we went +for a web-scraping approach using BeautifulSoup, patterned after a response in this stack-overflow thread: https://stackoverflow.com/questions/58734176/how-to-use-github-api-to-get-a-repositorys-dependents-information-in-github @@ -12,12 +12,12 @@ 1.) One that has ALL dependents that are active repositories (i.e. no "Ghost" icon in the web page). -2.) One list that only retains packages with >= min_stars stars, but also +2.) Another one that only retains packages with >= min_stars stars, but also includes a list of the GitHub "topics" associated with each package. 3.) A third list that is based on filtering the second list. During filtering, a package is retained if either: a.) Any string from repo_name_terms is in the repository organization/name - b.) A topic in the repo's topic lists matches a topic in topic_search_terms + b.) A topic in the repo's topic list matches a topic in topic_search_terms The three variables containing the lists described above are: @@ -47,10 +47,10 @@ # START OF USER-CONFIGURABLE OPTIONS # ---------------------------------- -# The repository we will query for it's dependents +# The repository we will query (whose dependents we want to find) repo_to_query = "scikit-image/scikit-image" -# Retrieve detailed topic lists only for the packages with >= min_stars stars. +# Retrieve detailed topic lists only for packages with >= min_stars stars. min_stars = 5 # If True, will write the three lists to .pickle files in the current directory @@ -79,7 +79,7 @@ # Search terms of interest in the repository's topics (see description at top). # This list was created to match bio-image applications by manually curating -# topic names from the full list of packages. +# topic names from the full list of dependent packages. topic_search_terms = [ 'airways', 'anatomy',