diff --git a/skimage_filter_dependents.py b/skimage_filter_dependents.py
new file mode 100644
index 0000000..d37e752
--- /dev/null
+++ b/skimage_filter_dependents.py
@@ -0,0 +1,346 @@
+"""
+At the time this script was created (July 2020), GitHub did not offer an
+official way to query dependent packages through its API, so we use a
+web-scraping approach based on BeautifulSoup, patterned after an answer in
+this Stack Overflow thread:
+https://stackoverflow.com/questions/58734176/how-to-use-github-api-to-get-a-repositorys-dependents-information-in-github
+
+To retrieve topic lists via the GitHub API, the user must have defined a
+GITHUB_TOKEN environment variable.
+
+This script generates three lists of packages:
+
+1.) One containing ALL dependents that are active repositories (i.e. no
+"Ghost" icon on the web page).
+2.) Another that only retains packages with >= min_stars stars, but also
+includes a list of the GitHub "topics" associated with each package.
+3.) A third list obtained by filtering the second one. During filtering, a
+package is retained if either:
+    a.) Any string from repo_name_terms is in the repository organization/name
+    b.) A topic in the repo's topic list matches a topic in topic_search_terms
+
+The three variables containing the lists described above are:
+
+Outputs
+-------
+all_packages : list of tuple
+    Each element is a (name, forks, stars) tuple.
+popular_packages : list of tuple
+    Each element is a (name, forks, stars, topics) tuple.
+popular_filtered_packages : list of tuple
+    Each element is a (name, forks, stars, topics) tuple.
+"""
+
+import os
+import pickle
+
+from bs4 import BeautifulSoup
+from github import Github
+import pandas
+import requests
+
+# we use PyGithub to retrieve topic lists
+token = os.environ['GITHUB_TOKEN']
+g = Github(token)
+
+# ----------------------------------
+# START OF USER-CONFIGURABLE OPTIONS
+# ----------------------------------
+
+# The repository to query (i.e. whose dependents we want to find)
+repo_to_query = "scikit-image/scikit-image"
+
+# Retrieve detailed topic lists only for packages with >= min_stars stars.
+min_stars = 5
+
+# If True, write the three lists to .pickle files
+# (note the hard-coded output directory further below)
+save_to_pickle = False
+# If True, write the three lists to .csv files in the current directory
+save_to_csv = True
+
+# Search terms of interest in the repository organization/name
+# (see description at top). All terms should be in lower case.
+repo_name_terms = [
+    'brain',
+    'cell',
+    'ecg',
+    'eeg',
+    'medi',
+    'mri',
+    'neuro',
+    'pathol',
+    'retin',
+    'slide',
+    'spectro',
+    'tissue',
+    'tomo',
+]
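+# For example, the term 'medi' above would retain a (hypothetical) dependent
+# named "some-org/medical-image-tools": the filter further below performs a
+# case-insensitive substring test against the full "organization/name" string.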
+
+# Search terms of interest in the repository's topics (see description at top).
+# This list was created to match bio-image applications by manually curating
+# topic names from the full list of dependent packages.
+topic_search_terms = [
+    'airways',
+    'anatomy',
+    'arteries',
+    'astrocytes',
+    'atomic-force-microscopy',
+    'afm',
+    'axon',
+    'bioimage-informatics',
+    'bioinformatics',
+    'biologists',
+    'biomedical-image-processing',
+    'bionic-vision',
+    'biophysics',
+    'brain-connectivity',
+    'brain-imaging',
+    'brain-mri',
+    'brain-tumor-segmentation',
+    'brats',
+    'calcium',
+    'cancer-research',
+    'cell-biology',
+    'cell-detection',
+    'cell-segmentation',
+    'computational-pathology',
+    'connectome',
+    'connectomics',
+    'cryo-em',
+    'ct-data',
+    'deconvolution-microscopy',
+    'dicom',
+    'dicom-rt',
+    'digital-pathology-data',
+    'digital-pathology',
+    'digital-slide-archive',
+    'dmri',
+    'electron-microscopy',
+    'electrophysiology',
+    'fluorescence',
+    'fluorescence-microscopy-imaging',
+    'fmri',
+    'fmri-preprocessing',
+    'functional-connectomes',
+    'healthcare-imaging',
+    'histology',
+    'voxel',
+    'microorganism-colonies',
+    'microscopy',
+    'microscopy-images',
+    'medical',
+    'medical-image-computing',
+    'medical-image-processing',
+    'medical-images',
+    'medical-imaging',
+    'mri',
+    'myelin',
+    'neural-engineering',
+    'neuroanatomy',
+    'neuroimaging',
+    'neuroimaging-analysis',
+    'neuropoly',
+    'neuroscience',
+    'nih-brain-initiative',
+    'openslide',
+    'pathology',
+    'pathology-image',
+    'radiation-oncology',
+    'radiation-physics',
+    'raman',
+    'retinal-implants',
+    'scanning-probe-microscopy',
+    'scanning-tunnelling-microscopy',
+    'single-cell-imaging',
+    'slide-images',
+    'spectroscopy',
+    'spinalcord',
+    'stm',
+    'stem',
+    'stitching',
+    'structural-connectomes',
+    'tissue-localization',
+    'tomography',
+    'volumetric-images',
+    'whole-slide-image',
+    'whole-slide-imaging',
+]
+
+# Omit the following repositories from the filtered list.
+# These match at least one of the search terms above, but do not appear to be
+# biology-focused (e.g. the term "cell" happens to appear in "Marcello").
+omit_list = [
+    'Marcello-Sega/pytim',
+    'PMEAL/porespy'
+]
+
+# --------------------------------
+# END OF USER-CONFIGURABLE OPTIONS
+# --------------------------------
+
+# Parse at most this many web pages.
+# Parsing should automatically stop when reaching the last page.
+max_page_num = 100
+
+url = ('https://github.com/{}/network/dependents'
+       '?dependent_type=PACKAGE').format(repo_to_query)
+
+package_list = []
+ghost_list = []
+prev_len = 0
+for i in range(max_page_num):
+    # retrieve HTML for the current URL
+    print("GET " + url)
+    r = requests.get(url)
+    soup = BeautifulSoup(r.content, "html.parser")
+
+    page_package_list = []
+    page_ghost_list = []
+    for t in soup.find_all("div", {"class": "Box-row"}):
+        try:
+            # find repository org/name
+            name = "{}/{}".format(
+                t.find('a', {"data-repository-hovercards-enabled": ""}).text,
+                t.find('a', {"data-hovercard-type": "repository"}).text
+            )
+        except AttributeError:
+            # Ghost repositories give None for the find() calls above, which
+            # results in an AttributeError when trying to access .text
+            page_ghost_list.append(t.text)
+            continue
+
+        # extract the number of stars
+        stars = 'unknown'
+        for span in t.find_all('span', attrs={'class': 'text-gray-light'}):
+            svg_star = span.find_all('svg', attrs={'class': 'octicon-star'})
+            if svg_star:
+                # remove the "," in e.g. "1,000" before casting to int
+                stars = int(span.text.strip().replace(",", ""))
+                break
+
+        # extract the number of forks
+        forks = 'unknown'
+        for span in t.find_all('span', attrs={'class': 'text-gray-light'}):
+            svg_fork = span.find_all('svg',
+                                     attrs={'class': 'octicon-repo-forked'})
+            if svg_fork:
+                # remove the "," in e.g. "1,000" before casting to int
+                forks = int(span.text.strip().replace(",", ""))
+                break
+
+        page_package_list.append((name, forks, stars))
+
+    # append packages from the current page to the overall lists
+    package_list = package_list + page_package_list
+    ghost_list = ghost_list + page_ghost_list
+
+    # remove any duplicates
+    package_list = list(set(package_list))
+    ghost_list = list(set(ghost_list))
+
+    # terminate if there was no change relative to the previous page
+    new_len = len(package_list) + len(ghost_list)
+    if new_len == prev_len:
+        print("no change in package lists... stopping scraping")
+        break
+    prev_len = new_len
+
+    # find the URL for the "Next" page of packages
+    paginationContainers = soup.find(
+        "div", {"class": "paginate-container"}).find_all('a')
+    url = None
+    for paginationContainer in paginationContainers:
+        # Make sure we are retrieving the "Next" page and not the "Previous"
+        if paginationContainer.text == "Next":
+            url = paginationContainer["href"]
+    if url is None:
+        print("No next page found... stopping scraping")
+        break
+
+# sort by descending number of stars
+# This is the first list mentioned at the top.
+all_packages = sorted(package_list, key=lambda x: x[2], reverse=True)
+
+# Create the second list by retaining only those with >= min_stars stars.
+# Note that in the package list, each tuple is:
+#     (name, # of forks, # of stars)
+_popular_packages = [p for p in all_packages if p[2] >= min_stars]
+n_popular = len(_popular_packages)
+
+# add a 4th element to each tuple, containing the GitHub topic list
+popular_packages = []
+
+for n, p in enumerate(_popular_packages):
+    print("Retrieving topics for package {} of {}".format(n + 1, n_popular))
+    repo_name = p[0]
+    repo = g.get_repo(repo_name)
+    topics = repo.get_topics()
+    popular_packages.append(p + (topics,))
+
+print("Applying filtering")
+popular_filtered_packages = []
+for p in popular_packages:
+    name = p[0]
+    name_lower = name.lower()
+    if name in omit_list:
+        continue
+    topics = p[3]
+    keep = False  # unless we match a term below, we will exclude the package
+
+    # check for a match based on the repository organization/name
+    for m in repo_name_terms:
+        if m in name_lower:
+            keep = True
+            break
+
+    # if not already a match, search based on the topic search terms
+    if not keep:
+        for topic in topics:
+            if topic in topic_search_terms:
+                keep = True
+                break
+    if keep:
+        popular_filtered_packages.append(p)
+
+# base name used for the output files
+fname_base = repo_to_query.replace('/', '_')
+
+# dump output lists to pickle
+if save_to_pickle:
+    print("Writing pickle files")
+
+    os.chdir('/media/lee8rx/data/Dropbox/Dropbox/Grants/CZI')
+    with open(fname_base + '_all_packages.pickle', 'wb') as f:
+        pickle.dump(all_packages, f)
+
+    with open(fname_base + '_popular_packages.pickle', 'wb') as f:
+        pickle.dump(popular_packages, f)
+
+    with open(fname_base + '_popular_filtered_packages.pickle', 'wb') as f:
+        pickle.dump(popular_filtered_packages, f)
+
+if save_to_csv:
+    print("Writing CSV files")
+    df_all = pandas.DataFrame(
+        all_packages,
+        columns=('name', '# of forks', '# of stars')
+    )
+    df_all = df_all.set_index('name')
+    df_all.to_csv(fname_base + '_all_dependents.csv')
+
+    df_popular = pandas.DataFrame(
+        popular_packages,
+        columns=('name', '# of forks', '# of stars', 'topics')
+    )
+    df_popular = df_popular.set_index('name')
+    df_popular.to_csv(fname_base + '_popular_dependents.csv')
+
+    df_filtered_popular = pandas.DataFrame(
+        popular_filtered_packages,
+        columns=('name', '# of forks', '# of stars', 'topics')
+    )
+    df_filtered_popular = df_filtered_popular.set_index('name')
+    df_filtered_popular.to_csv(fname_base + '_filtered_dependents.csv')
+
+    # print(df_filtered_popular.to_markdown())
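
A minimal sketch (not part of the patch) of how the filtered CSV written above could be loaded back for inspection, assuming the default repo_to_query and save_to_csv = True. The file name follows from fname_base in the script; ast.literal_eval is used because pandas stores the topic lists in the CSV as their string representation:

    import ast

    import pandas

    # file name follows from fname_base = repo_to_query.replace('/', '_')
    df = pandas.read_csv('scikit-image_scikit-image_filtered_dependents.csv',
                         index_col='name')
    # the 'topics' column round-trips through CSV as strings like "['mri', 'dicom']"
    df['topics'] = df['topics'].apply(ast.literal_eval)
    # rows are already sorted by descending star count by the script above
    print(df.head(10))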