"""
Etymological clustering of Yiddish nouns.
Idea: dimensionality-reduce the ngram count / tfidf matrix
using a variety of methods & cluster the reduced words.
Variables to experiment over:
- CV / TFIDF as count matrix
- n-gram range: 1, 2, 3
- Dim Reduction algo: TruncatedSVD, LLE, T-SNE, spectral, Isomap, MDS
- Dimensionality to reduce down to: 2, 5, 10, 15, 20
- Clustering algorithm: [kmeans, spectral]
- Number of clusters: 2, 3
(c) Jonne Saleva, 2020
"""
import itertools as it

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding, TSNE
# helpers
def flatten(nested):
    """Flatten one level of nesting into a single flat list."""
    return list(it.chain.from_iterable(nested))
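# e.g. flatten([['beyz', 'bobe'], ['kugel']]) -> ['beyz', 'bobe', 'kugel']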
def get_featurizer(tf, nr):
    """Return a character n-gram featurizer: raw counts ('cv') or TF-IDF ('tfidf')."""
    if tf == 'cv':
        return CountVectorizer(analyzer='char', ngram_range=(1, nr))
    elif tf == 'tfidf':
        return TfidfVectorizer(analyzer='char', ngram_range=(1, nr))
    else:
        raise ValueError(f'Invalid featurizer type: {tf}')
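# For example (illustration only), a (1, 2)-gram featurizer maps the word 'beyz'
# to the character unigrams {b, e, y, z} and bigrams {be, ey, yz} before
# counting (cv) or TF-IDF weighting (tfidf).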
def get_dim_reducer(dr, d):
    """Return a dimensionality reducer mapping features down to d components."""
    inventory = {
        'svd': TruncatedSVD,
        'lle': LocallyLinearEmbedding,
        'tsne': TSNE,
        'spectral': SpectralEmbedding,
        'isomap': Isomap,
        'mds': MDS,
    }
    return inventory[dr](n_components=d)
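# Note: scikit-learn's TSNE only supports n_components <= 3 with its default
# barnes_hut method; the larger latent dims here (5, 10, ...) would need
# TSNE(..., method='exact'), which is one reason tsne is left out of the run below.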
def get_clustering(c, k):
    """Return a clustering estimator configured for k clusters."""
    inventory = {'kmeans': KMeans, 'spectral': SpectralClustering}
    return inventory[c](n_clusters=k)
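# A minimal sketch of one experiment wired through the helpers above
# (toy words for illustration only; the real run iterates the full grid below):
#   feat = get_featurizer('cv', nr=2)
#   X = feat.fit_transform(['beyz', 'bobe', 'kugel']).toarray()
#   X_2d = get_dim_reducer('svd', d=2).fit_transform(X)
#   labels = get_clustering('kmeans', k=2).fit_predict(X_2d)  # e.g. array([0, 1, 0])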
# generate experiments
text_featurizers = ['cv', 'tfidf']
dim_reducers = ['svd', 'lle', 'spectral']  # tsne, isomap, mds omitted from this run
clusterings = ['kmeans', 'spectral']
ngram_ranges = [1, 2]  # 3 omitted
Ds = [2, 5, 10]
Ks = [2, 3]
experiments = it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)
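# full grid: 2 featurizers x 3 reducers x 2 clusterings x 2 n-gram ranges x 3 dims x 2 ks
# = 144 experiments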
# load data
corpus = pd.read_csv('../../data/corpus.csv')
n_words = corpus.shape[0]
all_words = flatten([corpus[c].tolist() for c in corpus.columns])
# one orthography label per word, matching the column order used to build all_words
# (assumes the corpus columns are ordered: romanized, yivo, chasidic)
orthographies = ['romanized'] * n_words + ['yivo'] * n_words + ['chasidic'] * n_words
# perform experiments
outputs = []
for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
    print('===============================')
    print(f'Beginning experiment {ix+1}...')
    print(f'Text featurization: {tf}')
    print(f'N-gram range: (1, {nr})')
    print(f'Dim reduction: {dr}')
    print(f'Latent dimension: {d}')
    print(f'Clustering algorithm: {c}')
    print(f'No. of clusters: {k}')
    if k == 2:
        print('\t=> we are doing Hebrew identification')
    # get the word-feature matrix
    featurizer = get_featurizer(tf, nr)
    X = featurizer.fit_transform(all_words)
    # the manifold learners expect dense input, so densify the sparse n-gram matrix
    if not isinstance(X, np.ndarray):
        X = X.toarray()
    # dimensionality-reduce it
    reducer = get_dim_reducer(dr, d)
    X_reduced = reducer.fit_transform(X)
    # cluster it
    clustering = get_clustering(c, k)
    predicted_clusters = clustering.fit_predict(X_reduced)
    # collect one row per word, annotated with the experiment settings
    output = pd.DataFrame()
    output['word'] = all_words
    output['orthography'] = orthographies
    output['predicted_cluster'] = predicted_clusters
    output['featurization'] = tf
    output['dim_reduction'] = dr
    output['latent_dim'] = d
    output['clustering'] = c
    output['max_ngram'] = nr
    output['n_clusters'] = k
    output['experiment_ix'] = ix + 1
    outputs.append(output)
output_df = pd.concat(outputs, ignore_index=True)
output_df.to_pickle('../../data/etymological-clustering-results-052820.pkl')
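# results can be reloaded later with
# pd.read_pickle('../../data/etymological-clustering-results-052820.pkl')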