From 3964eecc4cf576035afcca635efc1a41f73f0bdd Mon Sep 17 00:00:00 2001
From: Jonne <jonne@jonnesaleva.com>
Date: Wed, 27 May 2020 22:25:42 -0400
Subject: first clustering experiments -> seems to capture orthography, not so
 much etymological variation. maybe bigrams too short?

---
 ...ymological-kmeans-clustering-results-052720.pkl | Bin 0 -> 14311382 bytes
 .../etymological_clustering.cpython-37.pyc         | Bin 0 -> 1836 bytes
 src/experimental/etymological_clustering.py        | 124 +++++++++++++++++++++
 3 files changed, 124 insertions(+)
 create mode 100644 data/etymological-kmeans-clustering-results-052720.pkl
 create mode 100644 src/experimental/__pycache__/etymological_clustering.cpython-37.pyc
 create mode 100644 src/experimental/etymological_clustering.py

diff --git a/data/etymological-kmeans-clustering-results-052720.pkl b/data/etymological-kmeans-clustering-results-052720.pkl
new file mode 100644
index 0000000..efee4a9
Binary files /dev/null and b/data/etymological-kmeans-clustering-results-052720.pkl differ
diff --git a/src/experimental/__pycache__/etymological_clustering.cpython-37.pyc b/src/experimental/__pycache__/etymological_clustering.cpython-37.pyc
new file mode 100644
index 0000000..b6940fd
Binary files /dev/null and b/src/experimental/__pycache__/etymological_clustering.cpython-37.pyc differ
diff --git a/src/experimental/etymological_clustering.py b/src/experimental/etymological_clustering.py
new file mode 100644
index 0000000..13596cc
--- /dev/null
+++ b/src/experimental/etymological_clustering.py
@@ -0,0 +1,124 @@
+"""
+Etymological clustering of Yiddish nouns.
+
+Idea: dimensionality-reduce the ngram count / tfidf matrix
+      using a variety of methods & cluster the reduced words.
+
+Variables to experiment over:
+    - CV / TFIDF as count matrix
+    - n-gram range: 1, 2, 3
+    - Dim Reduction algo: TruncatedSVD, LLE, T-SNE, spectral, Isomap, MDS
+    - Dimensionality to reduce down to: 2, 5, 10, 15, 20
+    - Clustering algorithm: [kmeans, spectral]
+    - Number of clusters: 2, 3
+
+(c) Jonne Saleva, 2020
+"""
+
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.manifold import Isomap, TSNE, MDS, LocallyLinearEmbedding, SpectralEmbedding
+from sklearn.cluster import KMeans, SpectralClustering
+import itertools as it
+import pandas as pd 
+import numpy as np
+
+# helpers
+def flatten(nested):  # concatenate the sub-iterables of `nested` into one flat list
+    return list(it.chain.from_iterable(nested))
+
+def get_featurizer(tf, all_words, nr):
+    """Return an unfitted character n-gram vectorizer for n-grams of length 1..nr.
+
+    tf picks the class: 'cv' -> CountVectorizer, 'tfidf' -> TfidfVectorizer; any
+    other value raises ValueError. all_words is accepted but never used.
+    """
+    if tf not in ('cv', 'tfidf'):
+        raise ValueError('Invalid featurizer type')
+    vectorizer_cls = CountVectorizer if tf == 'cv' else TfidfVectorizer
+    return vectorizer_cls(analyzer='char', ngram_range=(1, nr))
+
+def get_dim_reducer(dr, X, d):
+    """Return an unfitted reducer for the name dr, built with n_components=d.
+
+    An unknown dr raises KeyError. X is accepted but never used.
+    """
+    return {
+        'svd': TruncatedSVD,
+        'lle': LocallyLinearEmbedding,
+        'tsne': TSNE,
+        'spectral': SpectralEmbedding,
+        'isomap': Isomap,
+        'mds': MDS,
+    }[dr](n_components=d)
+
+def get_clustering(c, X, k):
+    """Return an unfitted clusterer for the name c, built with n_clusters=k.
+
+    c is 'kmeans' or 'spectral'; anything else raises KeyError. X is never used.
+    """
+    return {'kmeans': KMeans, 'spectral': SpectralClustering}[c](n_clusters=k)
+
+# generate experiments: one run per point in the cartesian product of the axes
+text_featurizers = ['cv', 'tfidf']
+dim_reducers = ['svd', 'lle', 'spectral']  # not run: tsne, isomap, mds
+clusterings = ['kmeans', 'spectral']
+ngram_ranges = [1, 2]  # upper n-gram length; 3 is not run
+Ds = [2, 5, 10]  # latent dimensions
+Ks = [2, 3]  # numbers of clusters
+experiments = it.product(text_featurizers, dim_reducers, clusterings,
+                         ngram_ranges, Ds, Ks)  # lazy: can be consumed only once
+
+# load data: all corpus columns are concatenated, column by column, into one word list
+corpus = pd.read_csv('../../data/corpus.csv')
+n_words = corpus.shape[0]
+all_words = flatten([corpus[c].tolist() for c in corpus.columns])
+orthographies = ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words  # NOTE(review): assumes exactly these 3 columns, in this order - confirm
+
+# perform experiments: featurize -> reduce -> cluster, one result frame per run
+outputs = []
+for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
+    # announce the configuration of this run
+    banner = [
+        '===============================',
+        f'Beginning experiment {ix+1}...',
+        f'Text featurization: {tf}',
+        f'N-gram range: (1, {nr})',
+        f'Dim reduction: {dr}',
+        f'Latent dimension: {d}',
+        f'Clustering algorithm: {c}',
+        f'No. of clusters: {k}',
+    ]
+    if k == 2:
+        banner.append('\t=> we are doing Hebrew identification')
+    print('\n'.join(banner))
+
+    # word-feature matrix, converted with toarray() unless it is already an ndarray
+    vectorizer = get_featurizer(tf, all_words, nr)
+    counts = vectorizer.fit_transform(all_words)
+    word_features = counts if isinstance(counts, np.ndarray) else counts.toarray()
+
+    # project the features down to d dimensions
+    reducer = get_dim_reducer(dr, word_features, d)
+    embedding = reducer.fit_transform(word_features)
+
+    # assign each word to one of k clusters
+    labels = get_clustering(c, embedding, k).fit_predict(embedding)
+
+    # one row per word, tagged with the settings that produced its label
+    outputs.append(pd.DataFrame({
+        'word': all_words,
+        'predicted_cluster': labels,
+        'featurization': tf,
+        'dim_reduction': dr,
+        'latent_dim': d,
+        'clustering': c,
+        'max_ngram': nr,
+        'n_clusters': k,
+        'experiment_ix': ix + 1,
+    }))
+
+# stack all runs into one long frame and persist it
+output_df = pd.concat(outputs, ignore_index=True)
+output_df.to_pickle('../../data/etymological-clustering-results-052820.pkl')
+
-- 
cgit 1.4.1-2-gfad0