# prepare.py
import pandas as pd
import numpy as np
import os
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
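
# The stopword list and WordNet lemmatizer rely on NLTK corpora; if they are
# not already installed locally, a one-time download is needed (sketch, safe to re-run):
# nltk.download('stopwords')
# nltk.download('wordnet')
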
def basic_clean(df, col):
    '''
    This function takes in a df and a string for a column name and
    returns the df with a new column named 'basic_clean' containing the
    passed column's text normalized.
    '''
    df['basic_clean'] = df[col].str.lower()\
        .str.replace(r'[^\w\s]', '', regex=True)\
        .str.normalize('NFKC')\
        .str.encode('ascii', 'ignore')\
        .str.decode('utf-8', 'ignore')
    return df
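
# Usage sketch (assumes a DataFrame with the target text column, e.g. 'readme_contents'):
# df = basic_clean(df, 'readme_contents')
# df['basic_clean'] then holds lowercased, punctuation-free, ASCII-only text.
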
def tokenize(df, col):
    '''
    This function takes in a df and a string for a column name and
    returns the df with a new column named 'clean_tokes' containing the
    passed column's text tokenized into a list of tokens.
    '''
    # Use the Toktok tokenizer imported above
    tokenizer = ToktokTokenizer()
    df['clean_tokes'] = df[col].apply(tokenizer.tokenize)
    return df
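
# Usage sketch: tokenize the normalized text produced by basic_clean.
# df = tokenize(df, 'basic_clean')
# df['clean_tokes'] then holds a list of tokens per row.
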
def stem(df, col):
    '''
    This function takes in a df and a string for a column name and
    returns the df with a new column named 'stemmed'.
    '''
    # Create the Porter stemmer
    ps = nltk.stem.PorterStemmer()
    # Stem each token from the clean_tokes Series of lists
    stems = df[col].apply(lambda row: [ps.stem(word) for word in row])
    # Join the cleaned, stemmed lists of words back into sentences
    df['stemmed'] = stems.str.join(' ')
    return df
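
# Usage sketch: stem the token lists produced by tokenize.
# df = stem(df, 'clean_tokes')
# df['stemmed'] then holds the stemmed tokens rejoined into one string per row.
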
def lemmatize(df, col):
    '''
    This function takes in a df and a string for a column name and
    returns the original df with a new column named 'lemmatized'.
    '''
    # Create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    # Lemmatize each token from our clean_tokes Series of lists
    lemmas = df[col].apply(lambda row: [wnl.lemmatize(word) for word in row])
    # Join the cleaned and lemmatized tokens back into sentences
    df['lemmatized'] = lemmas.str.join(' ')
    return df
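
# Usage sketch: lemmatize the token lists produced by tokenize.
# df = lemmatize(df, 'clean_tokes')
# df['lemmatized'] then holds the lemmatized tokens rejoined into one string per row.
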
def remove_stopwords(df, col, extra_words=[]):
    '''
    This function takes in a df, a string for a column name, and an optional
    extra_words parameter for adding extra stopwords. It returns the df with a
    new column named 'clean_' + col with stopwords removed.
    '''
    # Create stopword_list
    stopword_list = stopwords.words('english')
    # Add optional additional stopwords
    stopword_list.extend(extra_words)
    # Split words in column
    words = df[col].str.split()
    # Keep only the words in each row that are not in stopword_list
    filtered_words = words.apply(lambda row: [word for word in row if word not in stopword_list])
    # Create new column of words that have stopwords removed
    df['clean_' + col] = filtered_words.str.join(' ')
    return df
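
# Usage sketch: drop stopwords from the lemmatized text, optionally adding
# project-specific words (the extra words below are only illustrative):
# df = remove_stopwords(df, 'lemmatized', extra_words=['http', 'github'])
# df['clean_lemmatized'] then holds the stopword-free text.
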
def prep_data(df, col):
    '''
    This function takes in a df and a string column name and
    returns the df with the original columns plus a cleaned,
    lemmatized text column with stopwords removed.
    '''
    # Do basic clean on article content
    df = basic_clean(df, col)
    # Tokenize clean article content
    df = tokenize(df, 'basic_clean')
    # Lemmatize cleaned and tokenized article content
    df = lemmatize(df, 'clean_tokes')
    # Remove stopwords from lemmatized article content
    df = remove_stopwords(df, 'lemmatized')
    return df[['repo', 'language', 'readme_contents', 'clean_lemmatized']]
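
# Usage sketch: run the full pipeline on a README DataFrame.
# prepped = prep_data(df, 'readme_contents')
# prepped then has the columns repo, language, readme_contents, clean_lemmatized.
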
def split(df, stratify_by=None):
    """
    3-way split into train, validate, and test datasets.
    To stratify, pass in a column name.
    """
    # Only stratify if a column name was provided
    stratify_values = df[stratify_by] if stratify_by else None
    train, test = train_test_split(df, test_size=.2, random_state=123, stratify=stratify_values)
    stratify_values = train[stratify_by] if stratify_by else None
    train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=stratify_values)
    return train, validate, test
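
# Minimal end-to-end sketch. The tiny DataFrame below is made up for
# illustration; in this project the real frame would come from an acquire step
# with 'repo', 'language', and 'readme_contents' columns.
if __name__ == '__main__':
    demo = pd.DataFrame({
        'repo': ['example/repo-one', 'example/repo-two'],
        'language': ['Python', 'Python'],
        'readme_contents': [
            'This is an Example README, with Punctuation!',
            'Another README describing the project and its usage.',
        ],
    })
    prepped = prep_data(demo, 'readme_contents')
    print(prepped[['repo', 'clean_lemmatized']])
    # With a larger, real dataset the prepped frame could then be split:
    # train, validate, test = split(prepped, stratify_by='language')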