Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
Metalkiler committed Jul 15, 2020
1 parent 7048f78 commit 7bf7f3a
Show file tree
Hide file tree
Showing 17 changed files with 102 additions and 7 deletions.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

100 changes: 98 additions & 2 deletions cane/build/lib/cane/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,20 +80,107 @@ def pcp(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others", n_coresJob=1,
if columns_use is not None:
dfFinal = pd.concat([i for i in d], axis=1)
dfFinal.columns = columns_use
dfFinal = pd.concat([dfFinal, TransformedData[TransformedData.columns.difference(columns_use,sort=False)]], axis=1,
dfFinal = pd.concat([dfFinal, TransformedData[TransformedData.columns.difference(columns_use, sort=False)]],
axis=1,
sort=True)
else:
dfFinal = pd.concat([i for i in d], axis=1)
dfFinal.columns = TransformedData.columns
return dfFinal


def pcp_multicolumn(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others",
                    columns_use=None):
    """
    Apply one shared PCP transformation to several columns at once.

    The values of every column listed in ``columns_use`` are pooled into a
    single series, that series is handed to ``__pcp_single__`` and the levels
    found in its result are then mapped back onto each selected column, so
    that all of those columns share the same transformation.

    :param dataset: dataset to transform (pandas DataFrame); it is copied, not modified
    :param perc: threshold percentage of P, between 0 and 1
    :param mergeCategory: Category for merging the data (by default "Others")
    :param columns_use: Specific columns to apply transformation (more than one,
        all of them present in the dataset)
    :return: a copy of the dataset with the selected columns transformed
    :raises AssertionError: if any of the argument checks fails
    """
    # Validate before copying, so that a non-DataFrame argument fails with the
    # intended message rather than with whatever ``.copy()`` happens to raise.
    assert isinstance(dataset, pd.DataFrame), "Dataset needs to be of type Pandas"
    assert 0 <= perc <= 1, "Percentage goes from 0 to 1, it may neither be negative nor above 1"
    assert (columns_use is not None), "multicolumn PCP requires the usage of columns!"
    assert (len(columns_use) > 1), "multicolumn PCP requires the usage of more than 1 column!"
    # str() keeps the failure message usable when column labels are not strings.
    assert all(flag in dataset.columns for flag in columns_use), (
        "Use columns specific to the dataset given the columns provided are not found "
        + ' '.join(str(j) for j in columns_use))

    TransformedData = dataset.copy()

    # Pool the values of all selected columns into one series.
    pooled = pd.Series(
        [value for column in columns_use for value in TransformedData[column].values],
        name="X")

    d = __pcp_single__(pooled, perc_inner=perc, mergeCategoryinner=mergeCategory)
    # Identity mapping over every value present in the transformed pooled series.
    dic = {v: np.unique(v)[0] for _, v in d.items()}

    for column in columns_use:
        # Values absent from the mapping come back as NaN from map() and are
        # relabelled with the merge category.
        TransformedData[column] = TransformedData[column].map(dic).fillna(mergeCategory)

    return TransformedData


def idf_multicolumn(dataset, columns_use=None):
    """
    Apply one shared IDF transformation to several columns at once.

    The Inverse Document Frequency (IDF) uses f(x) = log(n / f_x), where n is
    the length of x and f_x is the frequency of x. Here the values of every
    column listed in ``columns_use`` are pooled into a single series, the IDF
    lookup table is built from that pooled series with ``__idf_single_dic__``
    and the table is then applied to each selected column.

    :param dataset: dataset to transform (pandas DataFrame); it is copied, not modified
    :param columns_use: List of columns to use (more than one, all of them
        present in the dataset)
    :return: a copy of the dataset with the IDF transformation applied to the selected columns
    :raises AssertionError: if any of the argument checks fails
    """
    # Validate before copying, so that a non-DataFrame argument fails with the
    # intended message rather than with whatever ``.copy()`` happens to raise.
    assert isinstance(dataset, pd.DataFrame), "Dataset needs to be of type Pandas"
    assert (columns_use is not None), "multicolumn idf requires the usage of columns!"
    assert (len(columns_use) > 1), "multicolumn idf requires the usage of more than 1 column!"
    # str() keeps the failure message usable when column labels are not strings.
    assert all(flag in dataset.columns for flag in columns_use), (
        "Use columns specific to the dataset given the columns provided are not found "
        + ' '.join(str(j) for j in columns_use))

    TransformedData = dataset.copy()

    # Pool the values of all selected columns into one series.
    pooled = pd.Series(
        [value for column in columns_use for value in TransformedData[column].values],
        name="X")

    d = __idf_single_dic__(pooled)
    for column in columns_use:
        TransformedData[column] = TransformedData[column].replace(d)

    return TransformedData


def dic_pcp(dataset):
    """
    Build a dictionary describing the values found in a PCP-transformed dataset.

    For every ``(key, values)`` pair produced by ``dataset.items()`` the result
    holds a dictionary that maps each unique value to itself.

    :param dataset: Dataset Transformed with the PCP (pandas DataFrame or Series)
    :return: Dictionary with the constitution of the PCP dataset for each column value
    :raises AssertionError: if ``dataset`` is neither a DataFrame nor a Series
    """
    # A single check covering both accepted types; a DataFrame-only assertion
    # in front of it would make the Series case unreachable.
    assert isinstance(dataset, (pd.DataFrame, pd.Series)), "Dataset needs to be of type Pandas"
    return {k: {i: i for i in np.unique(v)} for k, v in dataset.items()}


Expand All @@ -109,6 +196,15 @@ def __idf_single__(f):
return resTreated


def __idf_single_dic__(f):
    """
    Build the IDF lookup table for a single series.

    Every value counted by ``f.value_counts`` is mapped to log(N / f_x), where
    N is ``len(f)`` and f_x is the count of that value.

    :param f: pandas Series to compute the weights from
    :return: dictionary mapping each counted value to its IDF weight
    """
    counts = f.value_counts(sort=False)
    N = len(f)
    # Pair each label with its count directly instead of indexing by position.
    return {level: math.log(N / count)
            for level, count in zip(counts.index, counts.values)}


def idf(dataset, n_coresJob=1, disableLoadBar=True, columns_use=None):
"""
The Inverse Document Frequency (IDF) uses f(x)= log(n/f_x),
Expand Down
3 changes: 1 addition & 2 deletions cane/cane.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: cane
Version: 0.0.1.7.7b1
Version: 0.0.1.7.7
Summary: Cane - Categorical Attribute traNsformation Environment
Home-page: https://github.com/Metalkiler/Cane-Categorical-Attribute-traNsformation-Environment
Author: Luís Miguel Matos, Paulo Cortez, Rui Mendes
Expand Down Expand Up @@ -31,7 +31,6 @@ Description: # Cane - Categorical Attribute traNsformation Environment

It is possible to apply these transformations to specific columns only instead of the full dataset (follow the example).

There is a new function called multicolumn (for PCP and IDF only). This function will aggregate 2 or more columns into a single one and apply the transformation to it. Afterwards it will map the transformation obtained into the disaggregated columns.

# Installation

Expand Down
Binary file removed cane/dist/cane-0.0.1.6-py3-none-any.whl
Binary file not shown.
Binary file removed cane/dist/cane-0.0.1.6.tar.gz
Binary file not shown.
Binary file removed cane/dist/cane-0.0.1.7.4-py3-none-any.whl
Binary file not shown.
Binary file removed cane/dist/cane-0.0.1.7.4.tar.gz
Binary file not shown.
Binary file removed cane/dist/cane-0.0.1.7.6-py3-none-any.whl
Binary file not shown.
Binary file removed cane/dist/cane-0.0.1.7.6.tar.gz
Binary file not shown.
Binary file added cane/dist/cane-0.0.1.7.7-py3-none-any.whl
Binary file not shown.
Binary file added cane/dist/cane-0.0.1.7.7.tar.gz
Binary file not shown.
Binary file removed cane/dist/cane-0.0.1.7.7b1-py3-none-any.whl
Binary file not shown.
Binary file removed cane/dist/cane-0.0.1.7.7b1.tar.gz
Binary file not shown.
Binary file added cane/dist/cane-0.0.1.7.7b3-py3-none-any.whl
Binary file not shown.
Binary file added cane/dist/cane-0.0.1.7.7b3.tar.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion cane/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


setuptools.setup(name='cane',
version='0.0.1.7.7beta1',
version='0.0.1.7.7',
description='Cane - Categorical Attribute traNsformation Environment',
author='Luís Miguel Matos, Paulo Cortez, Rui Mendes',
license='MIT',
Expand Down

0 comments on commit 7bf7f3a

Please sign in to comment.