""" Etymological clustering of Yiddish nouns. Idea: dimensionality-reduce the ngram count / tfidf matrix using a variety of methods & cluster the reduced words. Variables to experiment over: - CV / TFIDF as count matrix - n-gram range: 1, 2, 3 - Dim Reduction algo: TruncatedSVD, LLE, T-SNE, spectral, Isomap, MDS - Dimensionality to reduce down to: 2, 5, 10, 15, 20 - Clustering algorithm: [kmeans, spectral] - Number of clusters: 2, 3 (c) Jonne Saleva, 2020 """ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.decomposition import TruncatedSVD from sklearn.manifold import Isomap, TSNE, MDS, LocallyLinearEmbedding, SpectralEmbedding from sklearn.cluster import KMeans, SpectralClustering import itertools as it import pandas as pd import numpy as np # helpers def flatten(nested): return [x for x in it.chain.from_iterable(nested)] def get_featurizer(tf, all_words, nr): if tf == 'cv': feat = CountVectorizer(analyzer='char', ngram_range=(1, nr)) elif tf == 'tfidf': feat = TfidfVectorizer(analyzer='char', ngram_range=(1,nr)) else: raise ValueError('Invalid featurizer type') #words = np.array(all_words).reshape(-1, 1) return feat def get_dim_reducer(dr, X, d): inventory = { 'svd': TruncatedSVD, 'lle': LocallyLinearEmbedding, 'tsne': TSNE, 'spectral': SpectralEmbedding, 'isomap': Isomap, 'mds': MDS } reducer_cls = inventory[dr] reducer = reducer_cls(n_components=d) return reducer def get_clustering(c, X, k): inventory = {'kmeans': KMeans,'spectral': SpectralClustering} clustering_cls = inventory[c] clustering = clustering_cls(n_clusters=k) return clustering # generate experiments text_featurizers = ['cv', 'tfidf'] dim_reducers = ['svd', 'lle', 'spectral'] # tsne, isomap, mds clusterings = ['kmeans', 'spectral'] ngram_ranges = [1, 2] # 3 Ds = [2, 5, 10] Ks = [2,3] experiments = it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks) # load data corpus = pd.read_csv('../../data/corpus.csv') n_words = corpus.shape[0] 
all_words = flatten([corpus[c].tolist() for c in corpus.columns])
# One orthography label per word, in the same column order used to build
# all_words above. BUG FIX: the original line was missing '=' and would
# raise a NameError before any experiment ran. NOTE(review): this list is
# currently unused below — presumably meant to be joined onto the
# per-experiment output; confirm intent.
orthographies = ['romanized'] * n_words + ['yivo'] * n_words + ['chasidic'] * n_words

# perform experiments
outputs = []
for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
    print('===============================')
    print(f'Beginning experiment {ix+1}...')
    print(f'Text featurization: {tf}')
    print(f'N-gram range: (1, {nr})')
    print(f'Dim reduction: {dr}')
    print(f'Latent dimension: {d}')
    print(f'Clustering algorithm: {c}')
    print(f'No. of clusters: {k}')

    if k == 2:
        print('\t=> we are doing Hebrew identification')

    # get the word-feature matrix (densify: manifold reducers need arrays)
    featurizer = get_featurizer(tf, all_words, nr)
    X = featurizer.fit_transform(all_words)
    if not isinstance(X, np.ndarray):
        X = X.toarray()

    # dimensionality-reduce it
    reducer = get_dim_reducer(dr, X, d)
    X_reduced = reducer.fit_transform(X)

    # cluster it in the reduced space
    clustering = get_clustering(c, X_reduced, k)
    predicted_clusters = clustering.fit_predict(X_reduced)

    # record one row per word, tagged with this experiment's settings
    output = pd.DataFrame()
    output['word'] = all_words
    output['predicted_cluster'] = predicted_clusters
    output['featurization'] = tf
    output['dim_reduction'] = dr
    output['latent_dim'] = d
    output['clustering'] = c
    output['max_ngram'] = nr
    output['n_clusters'] = k
    output['experiment_ix'] = ix + 1
    outputs.append(output)

output_df = pd.concat(outputs, ignore_index=True)
output_df.to_pickle('../../data/etymological-clustering-results-052820.pkl')