-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
67 lines (56 loc) · 2.48 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# This script extracts the commit history of a remote GitHub repository and writes
# leaks.csv and leaks.json with every commit whose message matches a leak keyword.
import git
import pandas as pd
import re
import os
import sys
from tqdm import tqdm
from IPython.display import display
# Pandas display settings applied once at import time; they affect how
# DataFrames are rendered when printed or displayed.
_DISPLAY_OPTIONS = {
    'display.max_rows': 15,
    'display.max_columns': 5,
    'display.width': 1000000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)
def extract(url: str, **kwargs) -> git.Repo:
    """Return the repository at *url*, cloning it on first use.

    If a ``skale-manager-HTTPS`` directory already exists in the current
    working directory it is opened instead of cloning again; in that case
    ``url`` and ``branch`` are not used.

    Args:
        url: Remote URL to clone from.
        **kwargs: Optional settings. Only ``branch`` is read: the branch to
            check out when cloning. Omitted or ``None`` clones the remote's
            default branch.

    Returns:
        The opened or freshly cloned ``git.Repo``.
    """
    local_dir = 'skale-manager-HTTPS'

    if os.path.isdir(local_dir):
        print('Extracting repo from local dir: {}'.format(local_dir))
        return git.Repo(local_dir, search_parent_directories=True)

    print('Extracting remote repo: {}'.format(url))
    # Look the option up on the mapping itself: ``'branch' in kwargs.items()``
    # compares a str against (key, value) tuples and is therefore never true,
    # which silently dropped the requested branch.
    branch = kwargs.get('branch')
    if branch is not None:
        repo = git.Repo.clone_from(url=url, to_path=local_dir, branch=branch)
    else:
        repo = git.Repo.clone_from(url=url, to_path=local_dir)
    print('Finished extracting remote repository: {}'.format(url))
    return repo
def transform(repo: git.Repo, keys: list) -> pd.DataFrame:
    """Collect the commits whose message matches any of *keys*.

    Args:
        repo: Repository whose commits, as yielded by
            ``repo.iter_commits()``, are scanned.
        keys: Patterns joined with ``|`` into a single regular expression,
            so every entry is interpreted as regex syntax. Matching is
            case-insensitive.

    Returns:
        A DataFrame with the columns ``author`` (str), ``date`` (the
        commit's ``committed_date``) and ``message`` (str), one row per
        matching commit.
    """
    print('Leaking data:')
    # Flags belong in compile(): the second positional argument of a compiled
    # pattern's search() is the start position, not a flag set.
    pattern = re.compile("|".join(keys), re.IGNORECASE | re.UNICODE)

    # Materialize once so the history is not walked a second time just to
    # size the progress bar.
    commits = list(repo.iter_commits())
    rows = []
    with tqdm(total=len(commits), desc='Transforming commits...',
              bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') as pbar:
        for commit in commits:
            if pattern.search(commit.message):
                rows.append(
                    (str(commit.author), commit.committed_date, str(commit.message))
                )
            pbar.update(1)

    # Build the frame in one go instead of growing it row by row.
    dataframe = pd.DataFrame(rows, columns=['author', 'date', 'message'])
    print('Finished leaking data')
    return dataframe
def load(dataframe: pd.DataFrame):
    """Write *dataframe* to ``leaks.csv`` and ``leaks.json`` and display it.

    Both files are created in the current working directory. The JSON file
    is written record-oriented with an indent of 2.

    Args:
        dataframe: The table to persist and show.

    Returns:
        Whatever ``display(dataframe)`` returns.
    """
    csv_target, json_target = 'leaks.csv', 'leaks.json'

    print('Creating leaks.csv:')
    dataframe.to_csv(csv_target)

    print('Creating leaks.json:')
    dataframe.to_json(json_target, orient='records', indent=2)

    shown = display(dataframe)
    return shown
def main():
    """Run the extract -> transform -> load pipeline for skale-manager.

    Commits whose message matches ``password`` or ``key`` are collected and
    handed to ``load``. A ``KeyboardInterrupt`` raised anywhere in the
    pipeline is caught and reported instead of propagating.
    """
    try:
        source_url = 'https://github.com/skalenetwork/skale-manager'
        leaks = transform(
            repo=extract(url=source_url),
            keys=['password', 'key'],
        )
        load(dataframe=leaks)
        print('Finished loading leaks')
    except KeyboardInterrupt:
        print("Forced exit: Exiting program...")
# Script entry point: run the pipeline and use main()'s return value as the
# process exit status.
if __name__ == '__main__':
    raise SystemExit(main())