Explore using log-likelihood instead of TF-IDF #22

Open
chapmanjacobd opened this issue Dec 6, 2023 · 3 comments

@chapmanjacobd (Owner)

Is your feature request related to a problem? Please describe.

This is a solution looking for a problem, but the results might be interesting.

Describe the solution you'd like

https://wordhoard.northwestern.edu/userman/analysis-comparewords.html#loglike
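
For context, the statistic described on that page is Dunning's log-likelihood (G2). A minimal sketch, assuming the usual a/b/c/d convention from the write-up (word counts a and b in an analysis and a reference corpus of c and d total tokens); this is just an illustration, not a proposed implementation:

import math

def log_likelihood(a, b, c, d):
    # a: count of the word in the analysis corpus (c total tokens)
    # b: count of the word in the reference corpus (d total tokens)
    e1 = c * (a + b) / (c + d)  # expected count in the analysis corpus
    e2 = d * (a + b) / (c + d)  # expected count in the reference corpus
    g2 = 0.0
    if a:
        g2 += a * math.log(a / e1)
    if b:
        g2 += b * math.log(b / e2)
    return 2 * g2

# e.g. a word appearing 40 times in 100k tokens vs 25 times in 200k tokens
print(log_likelihood(40, 25, 100_000, 200_000))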

Describe alternatives you've considered

I'm not an expert in this space, so any other algorithms, suggestions, or explorations are welcome!

@chapmanjacobd (Owner, Author) commented Jan 25, 2024

similarly, experiment with other libraries like gensim, etc.:

def find_clusters_gensim(n_clusters, sentence_strings):
    from gensim import corpora
    from gensim.models import LdaModel
    from gensim.parsing.preprocessing import STOPWORDS
    from gensim.utils import simple_preprocess
    import numpy as np

    def preprocess(text):
        # tokenize, lowercase, and drop stopwords
        return [token for token in simple_preprocess(text, max_len=32) if token not in STOPWORDS]

    processed_docs = [preprocess(doc) for doc in sentence_strings]
    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = LdaModel(
        corpus,
        num_topics=n_clusters or int(np.sqrt(len(corpus))),
        id2word=dictionary,
        passes=10,
        random_state=0,
    )

    # assign each document to its most probable topic; keep one label per input
    # sentence (empty bags-of-words get -1) so labels stay aligned with the input
    clusters = [
        max(lda_model[doc], key=lambda item: item[1])[0] if doc else -1
        for doc in corpus
    ]
    return clusters

def find_clusters_fasttext(n_clusters, sentence_strings):
    from gensim.models import FastText
    from sklearn.cluster import KMeans
    import numpy as np

    def preprocess(text):
        return text.lower().split()

    tokenized_sentences = [preprocess(sentence) for sentence in sentence_strings]

    # train a small FastText model on the corpus itself
    fasttext_model = FastText(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, epochs=10)

    def sentence_embedding(sentence):
        # average the word vectors; fall back to a zero vector for empty sentences
        embeddings = [fasttext_model.wv[word] for word in sentence if word in fasttext_model.wv]
        if embeddings:
            return np.mean(embeddings, axis=0)
        return np.zeros(fasttext_model.vector_size)

    sentence_embeddings = np.array([sentence_embedding(sentence) for sentence in tokenized_sentences])

    n_clusters = n_clusters or int(np.sqrt(len(tokenized_sentences)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(sentence_embeddings)
    return kmeans.labels_
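
Either helper can be smoke-tested the same way (toy sentences below are purely illustrative; passing None for n_clusters falls back to the sqrt-of-corpus-size heuristic above):

sentences = [
    "the cat sat on the mat",
    "dogs chase cats around the yard",
    "stock prices fell sharply today",
    "the market closed lower this week",
]

labels = find_clusters_fasttext(None, sentences)
print(labels)  # e.g. [1 1 0 0] -- one cluster id per sentence; exact ids vary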

@chapmanjacobd (Owner, Author)

I tried these:


def find_clusters_gensim(n_clusters, sentence_strings):
    from gensim import corpora, matutils
    from gensim.models import LsiModel
    import numpy as np
    from sklearn.cluster import KMeans

    processed_docs = [s.split() for s in sentence_strings]
    n_clusters = n_clusters or int(np.sqrt(len(processed_docs)))

    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lsi_model = LsiModel(
        corpus,
        num_topics=n_clusters,
        id2word=dictionary,
    )

    # Transform the corpus to LSI space
    corpus_lsi = lsi_model[corpus]
    lsi_matrix = matutils.corpus2dense(corpus_lsi, num_docs=len(processed_docs), num_terms=n_clusters).T

    kmeans_model = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
    kmeans_model.fit(lsi_matrix)
    return kmeans_model.labels_

def find_clusters_gensim2(n_clusters, sentence_strings):
    from gensim import corpora
    from gensim.models import TfidfModel
    from gensim.matutils import corpus2dense
    from sklearn.cluster import KMeans
    import numpy as np

    tokenized_sentences = [s.split() for s in sentence_strings]
    n_clusters = n_clusters or int(np.sqrt(len(tokenized_sentences)))

    dictionary = corpora.Dictionary(tokenized_sentences)
    corpus = [dictionary.doc2bow(text) for text in tokenized_sentences]

    tfidf = TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]

    # densify the sparse TF-IDF corpus for scikit-learn
    X = corpus2dense(corpus_tfidf, num_terms=len(dictionary.token2id), num_docs=len(tokenized_sentences)).T

    clusterizer = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(X)
    return clusterizer.labels_

but they seem slower and produce lower-quality clusters than the TF-IDF approach
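
fwiw, a rough way to put numbers on "slower and lower quality" is to wall-clock each function and compare its labels against a small hand-labeled sample with the adjusted Rand index (the toy data below is purely illustrative):

import time
from sklearn.metrics import adjusted_rand_score

sentences = ["red apples", "green apples", "fast cars", "slow cars"]
hand_labels = [0, 0, 1, 1]  # tiny hand-labeled sample, purely illustrative

for fn in (find_clusters_gensim, find_clusters_gensim2, find_clusters_fasttext):
    start = time.perf_counter()
    labels = fn(2, sentences)
    elapsed = time.perf_counter() - start
    # ARI: 1.0 = perfect agreement with the hand labels, ~0.0 = chance
    print(fn.__name__, f"{elapsed:.2f}s", adjusted_rand_score(hand_labels, labels))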
