update

Metalkiler · Jul 15, 2020 · 7bf7f3a · 7bf7f3a
1 parent 7048f78
commit 7bf7f3a
Show file tree

Hide file tree

Showing 17 changed files with 102 additions and 7 deletions.
diff --git a/.idea/Cane-Categorical-Arrangement-of-Nominal-variables-Environment.iml b/.idea/Cane-Categorical-Arrangement-of-Nominal-variables-Environment.iml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/cane/build/lib/cane/__init__.py b/cane/build/lib/cane/__init__.py
@@ -80,20 +80,107 @@ def pcp(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others", n_coresJob=1,
  if columns_use is not None:
  dfFinal = pd.concat([i for i in d], axis=1)
  dfFinal.columns = columns_use
- dfFinal = pd.concat([dfFinal, TransformedData[TransformedData.columns.difference(columns_use,sort=False)]], axis=1,
+ dfFinal = pd.concat([dfFinal, TransformedData[TransformedData.columns.difference(columns_use, sort=False)]],
+ axis=1,
  sort=True)
  else:
  dfFinal = pd.concat([i for i in d], axis=1)
  dfFinal.columns = TransformedData.columns
  return dfFinal
 
 
def pcp_multicolumn(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others",
                    columns_use=None):
    """
    Apply the PCP transformation jointly to several columns.

    The values of every column in ``columns_use`` are pooled into one series,
    the PCP transformation is computed once on that pooled series, and the
    categories it produced are then mapped back onto each of the original
    columns, so all selected columns share the same transformation.

    :param dataset: dataset to transform (pandas DataFrame); it is copied, not modified
    :param perc: threshold percentage of P, must lie between 0 and 1
    :param mergeCategory: Category for merging the data (by default "Others")
    :param columns_use: Specific columns to apply transformation (two or more)
    :return: a copy of the "Dataset" with the selected columns transformed
    :raises AssertionError: when any of the arguments fails validation
    """
    # Validation is done with explicit raises instead of ``assert`` statements
    # so it stays active under ``python -O``. AssertionError is kept so that
    # existing callers catching it keep working. The type check runs before
    # ``copy()`` so a non-DataFrame input reports the intended message.
    if not isinstance(dataset, pd.DataFrame):
        raise AssertionError("Dataset needs to be of type Pandas")
    if not 0 <= perc <= 1:
        raise AssertionError("Percentage goes from 0 to 1, it may neither be negative nor above 1")
    if columns_use is None:
        raise AssertionError("multicolumn PCP requires the usage of columns!")
    if not len(columns_use) > 1:
        raise AssertionError("multicolumn PCP requires the usage of more than 1 column!")
    if not all(flag in dataset.columns for flag in columns_use):
        # str() keeps the message buildable when column labels are not strings.
        raise AssertionError(
            "Use columns specific to the dataset given the columns provided are not found "
            + ' '.join(str(j) for j in columns_use))

    TransformedData = dataset.copy()

    # Stack the values of all selected columns into a single series.
    pooled = pd.Series(
        [value for column in columns_use for value in TransformedData[column].values],
        name="X")

    # NOTE(review): __pcp_single__ is defined elsewhere in this module; it is
    # presumably returning the transformed series -- confirm against its definition.
    d = __pcp_single__(pooled, perc_inner=perc, mergeCategoryinner=mergeCategory)

    # Identity mapping over every value present in the transformed output.
    dic = {v: v for _, v in d.items()}

    for column in columns_use:
        # Values absent from the mapping become NaN and are then merged.
        TransformedData[column] = TransformedData[column].map(dic).fillna(mergeCategory)

    return TransformedData
+
+
def idf_multicolumn(dataset, columns_use=None):
    """
    Apply the Inverse Document Frequency (IDF) jointly to several columns.

    The IDF uses f(x)= log(n/f_x), where n is the length of x and f_x is the
    frequency of x. The values of every column in ``columns_use`` are pooled
    into one series, the IDF of each distinct value is computed once on that
    pooled series, and the result is mapped back onto each original column.

    :param dataset: dataset to transform (pandas DataFrame); it is copied, not modified
    :param columns_use: List of columns to use (two or more)
    :return: Dataset with the IDF transformation
    :raises AssertionError: when any of the arguments fails validation
    """
    # Validation is done with explicit raises instead of ``assert`` statements
    # so it stays active under ``python -O``. AssertionError is kept so that
    # existing callers catching it keep working. The type check runs before
    # ``copy()`` so a non-DataFrame input reports the intended message.
    if not isinstance(dataset, pd.DataFrame):
        raise AssertionError("Dataset needs to be of type Pandas")
    if columns_use is None:
        raise AssertionError("multicolumn idf requires the usage of columns!")
    if not len(columns_use) > 1:
        raise AssertionError("multicolumn idf requires the usage of more than 1 column!")
    if not all(flag in dataset.columns for flag in columns_use):
        # str() keeps the message buildable when column labels are not strings.
        raise AssertionError(
            "Use columns specific to the dataset given the columns provided are not found "
            + ' '.join(str(j) for j in columns_use))

    TransformedData = dataset.copy()

    # Stack the values of all selected columns into a single series.
    pooled = pd.Series(
        [value for column in columns_use for value in TransformedData[column].values],
        name="X")

    # Dictionary value -> IDF computed over the pooled values.
    d = __idf_single_dic__(pooled)

    for column in columns_use:
        # Every non-missing value has an entry in ``d`` because ``d`` was built
        # from these same columns; ``map`` does the lookup in one pass.
        TransformedData[column] = TransformedData[column].map(d)

    return TransformedData
+
+
 def dic_pcp(dataset):
  """
  :param dataset: Dataset Transformed with the PCP
  :return: Dictionary with the constitution of the PCP dataset for each column value
  """
- assert isinstance(dataset, pd.DataFrame), "Dataset needs to be of type Pandas"
+ assert isinstance(dataset, pd.DataFrame) or isinstance(dataset, pd.Series), "Dataset needs to be of type Pandas"
  return {k: {i: i for i in np.unique(v)} for k, v in dataset.items()}
 
 
@@ -109,6 +196,15 @@ def __idf_single__(f):
  return resTreated
 
 
def __idf_single_dic__(f):
    """
    Compute the Inverse Document Frequency of every distinct value of a series.

    Uses idf(x) = log(N / f_x), where N is the length of ``f`` and f_x is the
    number of occurrences of x in ``f``.

    :param f: pandas Series holding the values
    :return: Dictionary mapping each distinct value to its IDF
    """
    total = len(f)
    # value_counts() leaves missing values out by default, so they get no
    # entry here, although they are still counted in ``total``.
    return {value: math.log(total / count)
            for value, count in f.value_counts(sort=False).items()}
+
+
 def idf(dataset, n_coresJob=1, disableLoadBar=True, columns_use=None):
  """
  The Inverse Document Frequency (IDF) uses f(x)= log(n/f_x),

diff --git a/cane/cane.egg-info/PKG-INFO b/cane/cane.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cane
-Version: 0.0.1.7.7b1
+Version: 0.0.1.7.7
 Summary: Cane - Categorical Attribute traNsformation Environment
 Home-page: https://github.com/Metalkiler/Cane-Categorical-Attribute-traNsformation-Environment
 Author: Luís Miguel Matos, Paulo Cortez, Rui Mendes
@@ -31,7 +31,6 @@ Description: # Cane - Categorical Attribute traNsformation Environment
 
  It is possible to apply these transformations to specific columns only instead of the full dataset (follow the example).
 
- There is a new function called multicolumn (for PCP and IDF only). This function will aggregate 2 or more columns into a single one and apply the transformation to it. Afterwards it will map the transformation obtained into the disaggregated columns.
 
  # Installation
 

diff --git a/cane/dist/cane-0.0.1.6-py3-none-any.whl b/cane/dist/cane-0.0.1.6-py3-none-any.whl
diff --git a/cane/dist/cane-0.0.1.6.tar.gz b/cane/dist/cane-0.0.1.6.tar.gz
diff --git a/cane/dist/cane-0.0.1.7.4-py3-none-any.whl b/cane/dist/cane-0.0.1.7.4-py3-none-any.whl
diff --git a/cane/dist/cane-0.0.1.7.4.tar.gz b/cane/dist/cane-0.0.1.7.4.tar.gz
diff --git a/cane/dist/cane-0.0.1.7.6-py3-none-any.whl b/cane/dist/cane-0.0.1.7.6-py3-none-any.whl
diff --git a/cane/dist/cane-0.0.1.7.6.tar.gz b/cane/dist/cane-0.0.1.7.6.tar.gz
diff --git a/cane/dist/cane-0.0.1.7.7-py3-none-any.whl b/cane/dist/cane-0.0.1.7.7-py3-none-any.whl
diff --git a/cane/dist/cane-0.0.1.7.7.tar.gz b/cane/dist/cane-0.0.1.7.7.tar.gz
diff --git a/cane/dist/cane-0.0.1.7.7b1-py3-none-any.whl b/cane/dist/cane-0.0.1.7.7b1-py3-none-any.whl
diff --git a/cane/dist/cane-0.0.1.7.7b1.tar.gz b/cane/dist/cane-0.0.1.7.7b1.tar.gz
diff --git a/cane/dist/cane-0.0.1.7.7b3-py3-none-any.whl b/cane/dist/cane-0.0.1.7.7b3-py3-none-any.whl
diff --git a/cane/dist/cane-0.0.1.7.7b3.tar.gz b/cane/dist/cane-0.0.1.7.7b3.tar.gz
diff --git a/cane/setup.py b/cane/setup.py
@@ -8,7 +8,7 @@
 
 
 setuptools.setup(name='cane',
- version='0.0.1.7.7beta1',
+ version='0.0.1.7.7',
  description='Cane - Categorical Attribute traNsformation Environment',
  author='Luís Miguel Matos, Paulo Cortez, Rui Mendes',
  license='MIT',