-
Notifications
You must be signed in to change notification settings - Fork 1
/
ClassifyingDocumentsUsingDistributionOfTermUniqueness.cs
236 lines (208 loc) · 10.1 KB
/
ClassifyingDocumentsUsingDistributionOfTermUniqueness.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
// Exploring using the distribution of unique/common terms to classify different kinds of documents.
// Sample data is from the listings file from the Sortable code challenge. I am interesting in ways
// to classify a listing a product or accessory for a post match pruning stage.
//
// Assuming:
// 1) A "typical" camera listing has one model number (fairly unique) and many non unique terms.
// 2) Accessories that can be used for multiple models have many unique terms.
// It should be possible to decide between a product and a product accessory by examining the distribution of term probabilities.
//
// Observations:
//
// 1) Had use a non linear scale (I used powers of 2) for the histogram buckets otherwise all the unique (small probability) words ended up in the same bucket.
//
// 2) It does seem to be possible to identify accessory listings that have a high ratio of unique terms.
//
// [8] 310 digital camera video mask now rated to 65 feet
// 0 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 1 | 0 | 3 |
// [119] optekas extreme travelers essentials kit by opteka package inlcudes excursion series c900 fullsize waterproof canvas bag 6501300mm and 500mm telephoto lenses heavy duty tripod and monopod and much more for pentax k10d k20d k100d k110d k200d ist digital slr cameras
// 0 | 1 | 0 | 5 | 1 | 2 | 5 | 1 | 4 | 1 | 9 | 3 | 5 | 3 | 0 |
// [434] xvision ssl150 camera floodlight 150w
// 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 2 |
// [135] casecrown camo rugged canvas messenger bag for the canon digital rebel xsi 122 mp digital slr camera
// 0 | 3 | 1 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 1 | 1 | 2 | 1 |
//
// 3) High rate of false positives when listings are in other languages. There are not many listings in other languages so every word in these listings gets marked as unique.
// It may be possible to figure out what language the listings are by using identifying listings which have unusual character n-grams distributions but I don't
// think there will be enough text per listing to do this reliably.
//
// [578] jendigital jd 5200 z3 digitalkamera 50 2560 x 1920 32mb
// 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 2 | 4 | 0 | 1 | 0 | 0 |
// [837] samsung nx100 noir optique ifn 2050mm flash ng15
// 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 0 |
namespace ClassifyingDocumentsUsingDistributionOfTermUniqueness
{
public class ScoredListing
{
public string Listing { get; set; }
public List<double> Probablities { get; set; }
}
public class Experiment
{
public static IEnumerable<string> Ingest(string fileName)
{
using (var file = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, true))
{
using (var reader = new StreamReader(file))
{
while (reader.Peek() >= 0)
{
var line = reader.ReadLine();
// Parse listing from file
// "title\":\"LED Flash Macro Ring Light (48 X LED) with 6 Adapter Rings for For Canon/Sony/Nikon/Sigma Lenses\",\"manufacturer\"
var titleIdx = line.IndexOf(@"title", StringComparison.InvariantCultureIgnoreCase);
if (titleIdx < 0)
continue;
const int TITLE_LEFT_OFFSET = 8;
var manufactuerIndex = line.IndexOf(@"manufacturer", titleIdx + TITLE_LEFT_OFFSET, StringComparison.InvariantCultureIgnoreCase);
if (manufactuerIndex < 0)
continue;
const int TITLE_RIGHT_OFFSET = -13;
var listing = line.Substring(titleIdx + TITLE_LEFT_OFFSET, manufactuerIndex + TITLE_RIGHT_OFFSET);
yield return listing;
}
}
}
}
/// <summary>
/// Remove punctuation, digits, signs, double spaces, line end and lowercase everything
/// </summary>
private static string Munge(string original)
{
var temp = original.ToCharArray();
var result = new List<char>(original.Length);
bool lastCharWasSpace = false;
foreach (var c in temp)
{
if (char.IsLetter(c) || char.IsNumber(c))
{
result.Add(char.ToLower(c));
lastCharWasSpace = false;
}
else if ((char.IsWhiteSpace(c) || c == '/') && !lastCharWasSpace)
{
// Also takes care of \r\n
result.Add(' ');
lastCharWasSpace = true;
}
}
return new string(result.ToArray());
}
private static int NUM_BUCKETS = 15;
public static IList<double> GenerateHistogramAxis()
{
var histogram = Enumerable.Repeat(0D, NUM_BUCKETS).ToList();
// Most of the term probabilities are very small so we use buckets with increasingly smaller range.
var prob = 1D; // 100%
for (var i = 0; i < histogram.Count; i++)
{
// Each bucket is 1/2 range of previous.
prob /= 2; // 0.5, 0.25, 0.0125, ...
histogram[i] = prob;
}
return histogram;
}
public static IList<int> GenerateHistogramBuckets()
{
return Enumerable.Repeat(0, NUM_BUCKETS).ToList(); // Make some buckets
}
public static void Run()
{
var listings = Ingest("listings.txt").Take(99999).ToList();
var munged = listings.Select(Munge);
// Note: This is exploratory code so it isn't the greatest.
// Get term frequency over all listings
var freqByToken = new Dictionary<string, int>();
var totalTokens = 0;
foreach (var token in munged.SelectMany(x => x.Split(null)))
{
if (!freqByToken.ContainsKey(token))
freqByToken.Add(token, 0);
freqByToken[token]++;
totalTokens++;
}
DisplaySomeCommonWords(freqByToken);
DisplaySomeUniqueWords(freqByToken);
// Assign a probability for each term in the listings.
var scored = new List<ScoredListing>();
var maxProbability = 0D;
foreach (var listing in munged)
{
var probablities = new List<double>();
foreach (var token in listing.Split(null))
{
probablities.Add((double)freqByToken[token] / totalTokens);
}
if (probablities.Max() > maxProbability) { maxProbability = probablities.Max(); }
scored.Add(new ScoredListing { Listing = listing, Probablities = probablities });
}
var axisValues = GenerateHistogramAxis();
DisplayHistogramAxis(axisValues);
int count = 0;
foreach (var listing in scored)
{
// Assign this listing's term probabilities to a histogram bucket
var histogram = GenerateHistogramBuckets();
Debug.Assert(histogram.Count == axisValues.Count);
for(var i = 0; i < listing.Probablities.Count; i++)
{
// We are normalizing against all other docs. This may not be a good idea.
listing.Probablities[i] = listing.Probablities[i] / maxProbability;
// Try each bucket until probability fits in range
for (var j = 0; j < histogram.Count; j++)
{
if (axisValues[j] <= listing.Probablities[i])
{
histogram[(int)j]++;
break;
}
}
}
// These are just made up numbers to try classifying some listings
var numTerms = histogram.Sum();
var numUnique = histogram.Skip(10).Sum();
var ratioUnique = (double)numUnique / numTerms;
var numCommon = histogram.Take(5).Sum();
var ratioCommon = (double)numCommon / numTerms;
if (numUnique > 3 && ratioCommon > 0.05D && ratioUnique > 0.25D)
{
// Show possible accessory listings.
Console.WriteLine("[{0}] {1}", count++, listing.Listing);
Console.WriteLine(String.Concat(histogram.Select(x => x.ToString() + " | ")));
}
}
}
private static void DisplayHistogramAxis(IList<double> axisValues)
{
Console.WriteLine("------------------------------- Axis");
Console.WriteLine(String.Concat(axisValues.Select(x => x.ToString() + " | ")));
Console.WriteLine("-------------------------------");
}
private static void DisplaySomeUniqueWords(Dictionary<string, int> freqByToken)
{
Console.WriteLine("------------------------------- Examples of unique words");
var unique = freqByToken.OrderBy(x => x.Value).Take(15).Select(x => x.Key).ToList();
foreach (var x in unique) { Console.WriteLine(x); }
Console.WriteLine("-------------------------------");
}
private static void DisplaySomeCommonWords(Dictionary<string, int> freqByToken)
{
Console.WriteLine("------------------------------- Examples of common words");
var common = freqByToken.OrderByDescending(x => x.Value).Take(15).Select(x => x.Key).ToList();
foreach (var x in common) { Console.WriteLine(x); }
Console.WriteLine("-------------------------------");
}
}
public class Program
{
public static void Main(string[] args)
{
Experiment.Run();
}
}
}