# Provenance (from the patch header this file was extracted from):
#   commit:  3964eecc4cf576035afcca635efc1a41f73f0bdd
#   author:  Jonne
#   date:    Wed, 27 May 2020 22:25:42 -0400
#   subject: first clustering experiments -> seems to capture orthography,
#            not so much etymological variation. maybe bigrams too short?
#   path:    src/experimental/etymological_clustering.py (new file)
#   The same commit also added the binary files
#   data/etymological-kmeans-clustering-results-052720.pkl and
#   src/experimental/__pycache__/etymological_clustering.cpython-37.pyc.
"""
Etymological clustering of Yiddish nouns.

Idea: dimensionality-reduce the ngram count / tfidf matrix
      using a variety of methods & cluster the reduced words.

Variables to experiment over:
    - CV / TFIDF as count matrix
    - n-gram range: 1, 2, 3
    - Dim Reduction algo: TruncatedSVD, LLE, T-SNE, spectral, Isomap, MDS
    - Dimensionality to reduce down to: 2, 5, 10, 15, 20
    - Clustering algorithm: [kmeans, spectral]
    - Number of clusters: 2, 3

Running (or importing) this module executes every experiment in the grid
defined below and pickles the concatenated results.

(c) Jonne Saleva, 2020
"""

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import Isomap, TSNE, MDS, LocallyLinearEmbedding, SpectralEmbedding
from sklearn.cluster import KMeans, SpectralClustering
import itertools as it
import pandas as pd
import numpy as np


# helpers
def flatten(nested):
    """Return a single flat list with the items of every iterable in *nested*.

    Only one level of nesting is removed.
    """
    return list(it.chain.from_iterable(nested))


def get_featurizer(tf, all_words, nr):
    """Build an unfitted character n-gram vectorizer.

    Args:
        tf: 'cv' for a CountVectorizer or 'tfidf' for a TfidfVectorizer.
        all_words: unused; kept so existing call sites keep working.
        nr: largest n-gram length; the vectorizer uses the range (1, nr).

    Returns:
        The unfitted vectorizer.

    Raises:
        ValueError: if *tf* is neither 'cv' nor 'tfidf'.
    """
    if tf == 'cv':
        return CountVectorizer(analyzer='char', ngram_range=(1, nr))
    if tf == 'tfidf':
        return TfidfVectorizer(analyzer='char', ngram_range=(1, nr))
    raise ValueError('Invalid featurizer type')


def get_dim_reducer(dr, X, d):
    """Build an unfitted dimensionality reducer with ``n_components=d``.

    Args:
        dr: one of 'svd', 'lle', 'tsne', 'spectral', 'isomap', 'mds'.
        X: unused; kept so existing call sites keep working.
        d: number of output dimensions.

    Returns:
        The unfitted reducer, constructed with default settings otherwise.

    Raises:
        KeyError: if *dr* is not one of the names listed above.
    """
    inventory = {
        'svd': TruncatedSVD,
        'lle': LocallyLinearEmbedding,
        'tsne': TSNE,
        'spectral': SpectralEmbedding,
        'isomap': Isomap,
        'mds': MDS,
    }
    reducer_cls = inventory[dr]
    return reducer_cls(n_components=d)


def get_clustering(c, X, k):
    """Build an unfitted clustering estimator with ``n_clusters=k``.

    Args:
        c: 'kmeans' or 'spectral'.
        X: unused; kept so existing call sites keep working.
        k: number of clusters.

    Returns:
        The unfitted estimator, constructed with default settings otherwise.

    Raises:
        KeyError: if *c* is neither 'kmeans' nor 'spectral'.
    """
    inventory = {'kmeans': KMeans, 'spectral': SpectralClustering}
    clustering_cls = inventory[c]
    return clustering_cls(n_clusters=k)


# generate experiments
text_featurizers = ['cv', 'tfidf']
dim_reducers = ['svd', 'lle', 'spectral']  # tsne, isomap, mds
clusterings = ['kmeans', 'spectral']
ngram_ranges = [1, 2]  # 3
Ds = [2, 5, 10]
Ks = [2, 3]

# Full grid; note this is a one-shot iterator consumed by the loop below.
experiments = it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)

# load data
# The path is relative to the current working directory, not to this file.
corpus = pd.read_csv('../../data/corpus.csv')
n_words = corpus.shape[0]

# Every column is stacked into one long word list, column after column.
all_words = flatten([corpus[c].tolist() for c in corpus.columns])

# One orthography label per entry of all_words. The original line had no
# `=`, so it was evaluated as a subscript on an undefined name and raised
# NameError before any experiment could run.
# NOTE(review): assumes corpus.csv has exactly three columns, ordered
# romanized, yivo, chasidic -- confirm against the data file.
orthographies = ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words

# perform experiments
outputs = []
for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):

    print('===============================')
    print(f'Beginning experiment {ix+1}...')
    print(f'Text featurization: {tf}')
    print(f'N-gram range: (1, {nr})')
    print(f'Dim reduction: {dr}')
    print(f'Latent dimension: {d}')
    print(f'Clustering algorithm: {c}')
    print(f'No. of clusters: {k}')

    if k == 2:
        print('\t=> we are doing Hebrew identification')

    # get the word-feature matrix
    featurizer = get_featurizer(tf, all_words, nr)
    X = featurizer.fit_transform(all_words)
    # The vectorizers return a scipy sparse matrix; densify it so that
    # every reducer in the grid receives a plain ndarray.
    if not isinstance(X, np.ndarray):
        X = X.toarray()

    # dimensionality-reduce it
    reducer = get_dim_reducer(dr, X, d)
    X_reduced = reducer.fit_transform(X)

    # cluster it
    clustering = get_clustering(c, X_reduced, k)
    predicted_clusters = clustering.fit_predict(X_reduced)

    # One row per word, tagged with the settings of this experiment so the
    # concatenated frame can be filtered per configuration afterwards.
    output = pd.DataFrame()
    output['word'] = all_words
    output['predicted_cluster'] = predicted_clusters
    output['featurization'] = tf
    output['dim_reduction'] = dr
    output['latent_dim'] = d
    output['clustering'] = c
    output['max_ngram'] = nr
    output['n_clusters'] = k
    output['experiment_ix'] = ix + 1

    outputs.append(output)

output_df = pd.concat(outputs, ignore_index=True)
output_df.to_pickle('../../data/etymological-clustering-results-052820.pkl')