From be2f72b4e20148714edecead0201b65574b11f17 Mon Sep 17 00:00:00 2001
From: Gregory Lee
Date: Wed, 29 Jul 2020 17:21:54 -0400
Subject: [PATCH 1/3] Add web-scraping script to determine dependents

includes filtering to those specific to biology/medicine
---
 skimage_filter_dependents.py | 347 +++++++++++++++++++++++++++++++++++
 1 file changed, 347 insertions(+)
 create mode 100644 skimage_filter_dependents.py

diff --git a/skimage_filter_dependents.py b/skimage_filter_dependents.py
new file mode 100644
index 0000000..ea483ff
--- /dev/null
+++ b/skimage_filter_dependents.py
@@ -0,0 +1,347 @@
+"""
+At the time this script was created (July 2020), GitHub did not offer an
+official way to query the dependent packages through their API. So, we instead
+use a web-scraping approach via BeautifulSoup, patterned after a response
+in this stack-overflow thread:
+https://stackoverflow.com/questions/58734176/how-to-use-github-api-to-get-a-repositorys-dependents-information-in-github
+
+To retrieve topic lists via the GitHub API, the user must have defined a
+GITHUB_TOKEN environment variable.
+
+This script generates three lists of packages:
+
+1.) One that has ALL dependents that are active repositories (i.e. no "Ghost"
+icon in the web page).
+2.) One list that only retains packages with >= min_stars stars, but also
+includes a list of the GitHub "topics" associated with each package.
+3.) A third list that is based on filtering the second list. During filtering, a
+package is retained if either:
+    a.) Any string from repo_name_terms is in the repository organization/name
+    b.) A topic in the repo's topic lists matches a topic in topic_search_terms
+
+The three variables containing the lists described above are:
+
+Outputs
+-------
+all_packages : list of tuple
+    Each element is a (name, forks, stars) tuple.
+popular_packages : list of tuple
+    Each element is a (name, forks, stars, topics) tuple.
+popular_filtered_packages : list of tuple
+    Each element is a (name, forks, stars, topics) tuple.
+"""
+
+import os
+import pickle
+
+from bs4 import BeautifulSoup
+from github import Github
+import pandas
+import requests
+
+# we use PyGitHub to retrieve topic lists
+token = os.environ['GITHUB_TOKEN']
+g = Github(token)
+
+# ----------------------------------
+# START OF USER-CONFIGURABLE OPTIONS
+# ----------------------------------
+
+# The repository we will query for it's dependents
+repo_to_query = "scikit-image/scikit-image"
+
+# Retrieve detailed topic lists only for the packages with >= min_stars stars.
+min_stars = 5
+
+# If True, will write the three lists to .pickle files in the current directory
+save_to_pickle = False
+# If True, will write the three lists to .csv files in the current directory
+save_to_csv = True
+
+
+# Search terms of interest in the repository organization/name.
+# (see description at top)
+# All terms should be in lower case.
+repo_name_terms = [
+    'brain',
+    'cell',
+    'ecg',
+    'eeg',
+    'medi',
+    'mri',
+    'neuro',
+    'pathol',
+    'retin',
+    'slide',
+    'spectro',
+    'tissue',
+    'tomo',
+]
+
+# Search terms of interest in the repository's topics (see description at top).
+# This list was created to match bio-image applications by manually curating
+# topic names from the full list of packages.
+topic_search_terms = [
+    'airways',
+    'anatomy',
+    'arteries',
+    'astrocytes',
+    'atomic-force-microscopy',
+    'afm',
+    'axon',
+    'bioimage-informatics',
+    'bioinformatics',
+    'biologists',
+    'biomedical-image-processing',
+    'bionic-vision',
+    'biophysics',
+    'brain-connectivity',
+    'brain-imaging',
+    'brain-mri',
+    'brain-tumor-segmentation',
+    'brats',
+    'calcium',
+    'cancer-research',
+    'cell-biology',
+    'cell-detection',
+    'cell-segmentation',
+    'computational-pathology',
+    'connectome',
+    'connectomics',
+    'cryo-em',
+    'ct-data',
+    'deconvolution-microscopy',
+    'dicom',
+    'dicom-rt',
+    'digital-pathology-data',
+    'digital-pathology',
+    'digital-slide-archive',
+    'dmri',
+    'electron-microscopy',
+    'electrophysiology',
+    'fluorescence',
+    'fluorescence-microscopy-imaging',
+    'fmri',
+    'fmri-preprocessing',
+    'functional-connectomes',
+    'healthcare-imaging',
+    'histology',
+    'voxel',
+    'microorganism-colonies',
+    'microscopy',
+    'microscopy-images',
+    'neuroimaging',
+    'medical',
+    'medical-image-computing',
+    'medical-image-processing',
+    'medical-images',
+    'medical-imaging',
+    'mri',
+    'myelin',
+    'neural-engineering',
+    'neuroanatomy',
+    'neuroimaging',
+    'neuroimaging-analysis',
+    'neuropoly',
+    'neuroscience',
+    'nih-brain-initiative',
+    'openslide',
+    'pathology',
+    'pathology-image',
+    'radiation-oncology',
+    'radiation-physics',
+    'raman',
+    'retinal-implants',
+    'scanning-probe-microscopy',
+    'scanning-tunnelling-microscopy',
+    'single-cell-imaging',
+    'slide-images',
+    'spectroscopy',
+    'spinalcord',
+    'stm',
+    'stem',
+    'stitching',
+    'structural-connectomes',
+    'tissue-localization',
+    'tomography',
+    'volumetric-images',
+    'whole-slide-image',
+    'whole-slide-imaging',
+]
+
+# Omit the following repositories from the filtered list.
+# These match at least one of the search terms above, but do not appear to be
+# biology-focused. (e.g. the term "cell" appears in "Marcello").
+omit_list = [
+    'Marcello-Sega/pytim',
+    'PMEAL/porespy'
+]
+
+# --------------------------------
+# END OF USER-CONFIGURABLE OPTIONS
+# --------------------------------
+
+# Parse at most this many web pages.
+# Parsing should automatically stop when reaching the last page.
+max_page_num = 100
+
+packages = True
+url = 'https://github.com/{}/network/dependents?dependent_type=PACKAGE'.format(repo_to_query)
+
+package_list = []
+ghost_list = []
+prev_len = 0
+for i in range(max_page_num):
+    # retrieve HTML for the current URL
+    print("GET " + url)
+    r = requests.get(url)
+    soup = BeautifulSoup(r.content, "html.parser")
+
+    page_package_list = []
+    page_ghost_list = []
+    for t in soup.findAll("div", {"class": "Box-row"}):
+        try:
+            # find repository org/name
+            name = "{}/{}".format(
+                t.find('a', {"data-repository-hovercards-enabled":""}).text,
+                t.find('a', {"data-hovercard-type":"repository"}).text
+            )
+        except AttributeError:
+            # Ghost repositories will give None for the find() calls above.
+            # This results in an AttributeError when trying to access .text
+            page_ghost_list.append(t.text)
+            continue
+
+        # extract the number of stars
+        stars = 'unknown'
+        for span in t.find_all('span', attrs={'class': 'text-gray-light'}):
+            svg_star = span.find_all('svg', attrs={'class': 'octicon-star'})
+            if svg_star:
"1,000" before casting to int + stars = int(span.text.strip().replace(",", "")) + break + + # extract the number of forks + forks = 'unknown' + for span in t.find_all('span', attrs={'class': 'text-gray-light'}): + svg_fork = span.find_all('svg', + attrs={'class': 'octicon-repo-forked'}) + if svg_fork: + # replace ","" in e.g. "1,000" before casting to int + forks = int(span.text.strip().replace(",", "")) + break + + page_package_list.append((name, forks, stars)) + + + # append packages from the current page to the overall lists + package_list = package_list + page_package_list + ghost_list = ghost_list + page_ghost_list + + # remove any duplicates + package_list = list(set(package_list)) + ghost_list = list(set(ghost_list)) + + # terminate if no change from the prior URL + new_len = len(package_list) + len(ghost_list) + if new_len == prev_len: + print("no change in package lists... stopping scraping") + break + prev_len = new_len + + # find the URL for the "Next" page of packages + paginationContainers = soup.find("div", {"class":"paginate-container"}).find_all('a') + url = None + for paginationContainer in paginationContainers: + # Make sure we are retrieving the "Next" page and not the "Previous" + if paginationContainer.text == "Next": + url = paginationContainer["href"] + if url is None: + print("No additional next page found, ... stopping scraping") + break + +# sort by descending number of stars +# This is the first list mentioned at the top. +all_packages = sorted(package_list, key=lambda x:x[2], reverse=True) + +# Create the second list by retaining only those with >= min_stars +# Note that in the package list, the tuple is: +# (name, # of forks, # of stars) +_popular_packages = [p for p in all_packages if p[2] >= min_stars] +n_popular = len(_popular_packages) + +# add a 4th term to each tuple, containing the GitHub topic list +popular_packages = [] + +for n, p in enumerate(_popular_packages): + print("Retrieving topics for package {} of {}".format(n + 1, n_popular)) + repo_name = p[0] + repo = g.get_repo(repo_name) + topics = repo.get_topics() + popular_packages.append(p + (topics,)) + +print("Applying filtering") +popular_filtered_packages = [] +for p in popular_packages: + name = p[0] + name_lower = name.lower() + if name in omit_list: + continue + topics = p[3] + keep = False # unless we match a term below, we will exclude the package + + # check match based on repository organization/name + for m in repo_name_terms: + if m in name_lower: + keep = True + break + + # If not already a match, search based on topic search terms + if not keep: + for topic in topics: + if topic in topic_search_terms: + keep = True + break + if keep: + popular_filtered_packages.append(p) + +# dump output lists to pickle +fname_base = repo_to_query.replace('/', '_') +if save_to_pickle: + print("Writing pickle files") + + os.chdir('/media/lee8rx/data/Dropbox/Dropbox/Grants/CZI') + with open(fname_base + '_all_packages.pickle', 'wb') as f: + pickle.dump(all_packages, f) + + with open(fname_base + '_popular_packages.pickle', 'wb') as f: + pickle.dump(popular_packages, f) + + with open(fname_base + '_popular_filtered_packages.pickle', 'wb') as f: + pickle.dump(popular_filtered_packages, f) + +if save_to_csv: + print("Writing CSV files") + df_all = pandas.DataFrame( + all_packages, + columns=('name', '# of forks', '# of stars') + ) + df_all = df_all.set_index('name') + df_all.to_csv(fname_base + '_all_dependents.csv') + + df_popular = pandas.DataFrame( + popular_packages, + columns=('name', '# of forks', '# 
From 50a3257cdce9a11393305ace6547445e6bee4ddb Mon Sep 17 00:00:00 2001
From: Gregory Lee
Date: Wed, 29 Jul 2020 22:14:29 -0400
Subject: [PATCH 2/3] pep8 fixes

---
 skimage_filter_dependents.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/skimage_filter_dependents.py b/skimage_filter_dependents.py
index ea483ff..e29c73d 100644
--- a/skimage_filter_dependents.py
+++ b/skimage_filter_dependents.py
@@ -14,8 +14,8 @@
 icon in the web page).
 2.) One list that only retains packages with >= min_stars stars, but also
 includes a list of the GitHub "topics" associated with each package.
-3.) A third list that is based on filtering the second list. During filtering, a
-package is retained if either:
+3.) A third list that is based on filtering the second list. During filtering,
+a package is retained if either:
     a.) Any string from repo_name_terms is in the repository organization/name
     b.) A topic in the repo's topic lists matches a topic in topic_search_terms
@@ -58,8 +58,6 @@
 # If True, will write the three lists to .csv files in the current directory
 save_to_csv = True
-
-
 # Search terms of interest in the repository organization/name.
 # (see description at top)
 # All terms should be in lower case.
 repo_name_terms = [
@@ -187,7 +185,8 @@
 max_page_num = 100
 
 packages = True
-url = 'https://github.com/{}/network/dependents?dependent_type=PACKAGE'.format(repo_to_query)
+url = ('https://github.com/{}/network/dependents'
+       '?dependent_type=PACKAGE').format(repo_to_query)
 
 package_list = []
 ghost_list = []
 prev_len = 0
@@ -204,8 +203,8 @@
         try:
             # find repository org/name
             name = "{}/{}".format(
-                t.find('a', {"data-repository-hovercards-enabled":""}).text,
-                t.find('a', {"data-hovercard-type":"repository"}).text
+                t.find('a', {"data-repository-hovercards-enabled": ""}).text,
+                t.find('a', {"data-hovercard-type": "repository"}).text
             )
         except AttributeError:
             # Ghost repositories will give None for the find() calls above.
@@ -234,7 +233,6 @@
         page_package_list.append((name, forks, stars))
 
-
     # append packages from the current page to the overall lists
     package_list = package_list + page_package_list
     ghost_list = ghost_list + page_ghost_list
@@ -251,7 +249,8 @@
     prev_len = new_len
 
     # find the URL for the "Next" page of packages
-    paginationContainers = soup.find("div", {"class":"paginate-container"}).find_all('a')
+    paginationContainers = soup.find(
+        "div", {"class": "paginate-container"}).find_all('a')
     url = None
     for paginationContainer in paginationContainers:
         # Make sure we are retrieving the "Next" page and not the "Previous"
@@ -263,7 +262,7 @@
 # sort by descending number of stars
 # This is the first list mentioned at the top.
-all_packages = sorted(package_list, key=lambda x:x[2], reverse=True)
+all_packages = sorted(package_list, key=lambda x: x[2], reverse=True)
 
 # Create the second list by retaining only those with >= min_stars
 # Note that in the package list, the tuple is:
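As the script's docstring notes, the topic lists themselves come from the GitHub API through PyGitHub, which requires a GITHUB_TOKEN environment variable. Taken in isolation, that step amounts to the short sketch below; the repository name is only an example, and any "org/name" string returned by the scraper would be passed in the same way.

    import os

    from github import Github  # PyGitHub, the same client the script uses

    g = Github(os.environ["GITHUB_TOKEN"])           # token must be exported beforehand
    repo = g.get_repo("scikit-image/scikit-image")   # example "org/name" string
    print(repo.get_topics())                         # plain list of topic strings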
Lee" Date: Wed, 30 Sep 2020 15:09:28 -0400 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Marianne Corvellec --- skimage_filter_dependents.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/skimage_filter_dependents.py b/skimage_filter_dependents.py index e29c73d..d37e752 100644 --- a/skimage_filter_dependents.py +++ b/skimage_filter_dependents.py @@ -1,7 +1,7 @@ """ At the time this script was created (July 2020), GitHub did not offer an -official way to query the dependent packages through their API. So, we instead -use a web-scraping approach via BeautifulSoup, patterned after a response +official way to query dependent packages through their API. So, we went +for a web-scraping approach using BeautifulSoup, patterned after a response in this stack-overflow thread: https://stackoverflow.com/questions/58734176/how-to-use-github-api-to-get-a-repositorys-dependents-information-in-github @@ -12,12 +12,12 @@ 1.) One that has ALL dependents that are active repositories (i.e. no "Ghost" icon in the web page). -2.) One list that only retains packages with >= min_stars stars, but also +2.) Another one that only retains packages with >= min_stars stars, but also includes a list of the GitHub "topics" associated with each package. 3.) A third list that is based on filtering the second list. During filtering, a package is retained if either: a.) Any string from repo_name_terms is in the repository organization/name - b.) A topic in the repo's topic lists matches a topic in topic_search_terms + b.) A topic in the repo's topic list matches a topic in topic_search_terms The three variables containing the lists described above are: @@ -47,10 +47,10 @@ # START OF USER-CONFIGURABLE OPTIONS # ---------------------------------- -# The repository we will query for it's dependents +# The repository we will query (whose dependents we want to find) repo_to_query = "scikit-image/scikit-image" -# Retrieve detailed topic lists only for the packages with >= min_stars stars. +# Retrieve detailed topic lists only for packages with >= min_stars stars. min_stars = 5 # If True, will write the three lists to .pickle files in the current directory @@ -79,7 +79,7 @@ # Search terms of interest in the repository's topics (see description at top). # This list was created to match bio-image applications by manually curating -# topic names from the full list of packages. +# topic names from the full list of dependent packages. topic_search_terms = [ 'airways', 'anatomy',