diff options
author | Jonne <jonne@jonnesaleva.com> | 2020-05-27 22:25:42 -0400 |
---|---|---|
committer | Jonne <jonne@jonnesaleva.com> | 2020-05-27 22:25:42 -0400 |
commit | 3964eecc4cf576035afcca635efc1a41f73f0bdd (patch) | |
tree | 42ba3ca6f4ebddd052e828b11d2fb3cda9833f0d /src | |
parent | dd8a0175a34cddd748cb4fdd9485c314b36e19cc (diff) | |
download | yi-word-clustering-3964eecc4cf576035afcca635efc1a41f73f0bdd.tar.gz |
first clustering experiments -> seems to capture orthography, not so much etymological variation. maybe bigrams too short?
Diffstat (limited to 'src')
-rw-r--r-- | src/experimental/__pycache__/etymological_clustering.cpython-37.pyc | bin | 0 -> 1836 bytes | |||
-rw-r--r-- | src/experimental/etymological_clustering.py | 124 |
2 files changed, 124 insertions, 0 deletions
diff --git a/src/experimental/__pycache__/etymological_clustering.cpython-37.pyc b/src/experimental/__pycache__/etymological_clustering.cpython-37.pyc
new file mode 100644
index 0000000..b6940fd
--- /dev/null
+++ b/src/experimental/__pycache__/etymological_clustering.cpython-37.pyc
Binary files differ
diff --git a/src/experimental/etymological_clustering.py b/src/experimental/etymological_clustering.py
new file mode 100644
index 0000000..13596cc
--- /dev/null
+++ b/src/experimental/etymological_clustering.py
@@ -0,0 +1,124 @@
+"""
+Etymological clustering of Yiddish nouns.
+
+Idea: dimensionality-reduce the ngram count / tfidf matrix
+    using a variety of methods & cluster the reduced words.
+
+Variables to experiment over:
+    - CV / TFIDF as count matrix
+    - n-gram range: 1, 2, 3
+    - Dim Reduction algo: TruncatedSVD, LLE, T-SNE, spectral, Isomap, MDS
+    - Dimensionality to reduce down to: 2, 5, 10, 15, 20
+    - Clustering algorithm: [kmeans, spectral]
+    - Number of clusters: 2, 3
+
+(c) Jonne Saleva, 2020
+"""
+
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.manifold import Isomap, TSNE, MDS, LocallyLinearEmbedding, SpectralEmbedding
+from sklearn.cluster import KMeans, SpectralClustering
+import itertools as it
+import pandas as pd
+import numpy as np
+
+
+# helpers
+def flatten(nested):
+    """Concatenate an iterable of iterables into one flat list."""
+    return list(it.chain.from_iterable(nested))
+
+
+def get_featurizer(tf, all_words, nr):
+    """Return an unfitted character n-gram vectorizer with ngram_range (1, nr).
+
+    tf selects CountVectorizer ('cv') or TfidfVectorizer ('tfidf'); any other
+    value raises ValueError. all_words is accepted but not used.
+    """
+    if tf == 'cv':
+        return CountVectorizer(analyzer='char', ngram_range=(1, nr))
+    if tf == 'tfidf':
+        return TfidfVectorizer(analyzer='char', ngram_range=(1, nr))
+    raise ValueError('Invalid featurizer type')
+
+
+def get_dim_reducer(dr, X, d):
+    """Return an unfitted reducer chosen by name dr with n_components=d.
+
+    Unknown names raise KeyError. X is accepted but not used.
+    """
+    inventory = {
+        'svd': TruncatedSVD,
+        'lle': LocallyLinearEmbedding,
+        'tsne': TSNE,
+        'spectral': SpectralEmbedding,
+        'isomap': Isomap,
+        'mds': MDS,
+    }
+    return inventory[dr](n_components=d)
+
+
+def get_clustering(c, X, k):
+    """Return an unfitted clusterer chosen by name c with n_clusters=k.
+
+    Unknown names raise KeyError. X is accepted but not used.
+    """
+    inventory = {'kmeans': KMeans, 'spectral': SpectralClustering}
+    return inventory[c](n_clusters=k)
+
+
+# generate experiments
+text_featurizers = ['cv', 'tfidf']
+dim_reducers = ['svd', 'lle', 'spectral']  # tsne, isomap, mds
+clusterings = ['kmeans', 'spectral']
+ngram_ranges = [1, 2]  # 3
+Ds = [2, 5, 10]
+Ks = [2, 3]
+
+experiments = it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)
+
+# load data
+corpus = pd.read_csv('../../data/corpus.csv')
+n_words = corpus.shape[0]
+all_words = flatten([corpus[c].tolist() for c in corpus.columns])
+# orthography label for each entry of all_words (columns are concatenated in
+# order); assumes the CSV columns are romanized, yivo, chasidic -- TODO confirm
+orthographies = ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
+
+# perform experiments
+outputs = []
+for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
+    print('===============================')
+    print(f'Beginning experiment {ix+1}...')
+    print(f'Text featurization: {tf}')
+    print(f'N-gram range: (1, {nr})')
+    print(f'Dim reduction: {dr}')
+    print(f'Latent dimension: {d}')
+    print(f'Clustering algorithm: {c}')
+    print(f'No. of clusters: {k}')
+    if k == 2:
+        print('\t=> we are doing Hebrew identification')
+
+    # get the word-feature matrix, densified if it is not already an ndarray
+    featurizer = get_featurizer(tf, all_words, nr)
+    X = featurizer.fit_transform(all_words)
+    if not isinstance(X, np.ndarray):
+        X = X.toarray()
+
+    # dimensionality-reduce it
+    reducer = get_dim_reducer(dr, X, d)
+    X_reduced = reducer.fit_transform(X)
+
+    # cluster it
+    clustering = get_clustering(c, X_reduced, k)
+    predicted_clusters = clustering.fit_predict(X_reduced)
+
+    # one row per word; the scalar experiment settings fill every row
+    output = pd.DataFrame({'word': all_words, 'predicted_cluster': predicted_clusters})
+    output = output.assign(featurization=tf, dim_reduction=dr, latent_dim=d, clustering=c,
+                           max_ngram=nr, n_clusters=k, experiment_ix=ix + 1)
+    outputs.append(output)
+
+output_df = pd.concat(outputs, ignore_index=True)
+output_df.to_pickle('../../data/etymological-clustering-results-052820.pkl')