"""
Etymological clustering of Yiddish nouns.

Idea: dimensionality-reduce the ngram count / tfidf matrix
      using a variety of methods & cluster the reduced words.

Variables to experiment over:
    - CV / TFIDF as count matrix
    - n-gram range: 1, 2, 3
    - Dim Reduction algo: TruncatedSVD, LLE, T-SNE, spectral, Isomap, MDS
    - Dimensionality to reduce down to: 2, 5, 10, 15, 20
    - Clustering algorithm: [kmeans, spectral]
    - Number of clusters: 2, 3

(c) Jonne Saleva, 2020
"""

import itertools as it

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding, TSNE

# helpers
def flatten(nested):
    """Flatten one level of nesting into a single list."""
    return list(it.chain.from_iterable(nested))

def get_featurizer(tf, nr):
    """Return a character n-gram featurizer over n-grams of length 1..nr."""
    if tf == 'cv':
        feat = CountVectorizer(analyzer='char', ngram_range=(1, nr))
    elif tf == 'tfidf':
        feat = TfidfVectorizer(analyzer='char', ngram_range=(1, nr))
    else:
        raise ValueError(f'Invalid featurizer type: {tf!r}')

    return feat

def get_dim_reducer(dr, d):
    """Return a dimensionality reducer mapping down to d components."""
    inventory = {
        'svd': TruncatedSVD,
        'lle': LocallyLinearEmbedding,
        'tsne': TSNE,
        'spectral': SpectralEmbedding,
        'isomap': Isomap,
        'mds': MDS,
    }
    # note: the default (barnes-hut) t-SNE only supports n_components < 4,
    # so 'tsne' would need method='exact' for the larger values of d
    reducer_cls = inventory[dr]
    return reducer_cls(n_components=d)

def get_clustering(c, k):
    """Return a clustering estimator that finds k clusters."""
    inventory = {'kmeans': KMeans, 'spectral': SpectralClustering}
    clustering_cls = inventory[c]
    return clustering_cls(n_clusters=k)

# generate experiments
text_featurizers = ['cv', 'tfidf']
dim_reducers = ['svd', 'lle', 'spectral']  # disabled for now: 'tsne', 'isomap', 'mds'
clusterings = ['kmeans', 'spectral']
ngram_ranges = [1, 2, 3]
Ds = [2, 5, 10]
Ks = [2, 3]

experiments = list(it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks))
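# full grid: 2 featurizers x 3 reducers x 2 clusterings x 3 n-gram maxima
# x 3 latent dims x 2 cluster counts = 216 configurations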

# load data
corpus = pd.read_csv('../../data/corpus.csv')
n_words = corpus.shape[0]
all_words = flatten([corpus[c].tolist() for c in corpus.columns])
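# the three corpus columns hold the same nouns in romanized, YIVO, and
# Chasidic orthography; the label list below assumes that column order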
orthographies = ['romanized'] * n_words + ['yivo'] * n_words + ['chasidic'] * n_words

# perform experiments
outputs = []
n_exp = len(experiments)
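# run the full grid, collecting one labeled results DataFrame per experiment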
for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):

    print('===============================')
    print(f'Beginning experiment {ix+1} / {n_exp}...')
    print(f'Text featurization: {tf}')
    print(f'N-gram range: (1, {nr})')
    print(f'Dim reduction: {dr}')
    print(f'Latent dimension: {d}')
    print(f'Clustering algorithm: {c}')
    print(f'No. of clusters: {k}')

    # with two clusters, the task reduces to a binary split, which we read
    # as identifying Hebrew-origin words
    if k == 2:
        print('\t=> we are doing Hebrew identification')

    # get the word-feature matrix
    featurizer = get_featurizer(tf, nr)
    X = featurizer.fit_transform(all_words)
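    # the vectorizers return a scipy sparse matrix; densify it, since some
    # of the reducers used downstream (e.g. LLE) expect a dense array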
    if not isinstance(X, np.ndarray):
        X = X.toarray()

    # dimensionality-reduce it
    reducer = get_dim_reducer(dr, d)
    X_reduced = reducer.fit_transform(X)

    # cluster it
    clustering = get_clustering(c, k)
    predicted_clusters = clustering.fit_predict(X_reduced)
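    # labels from fit_predict are arbitrary integers in [0, k); they carry
    # no fixed meaning across experiments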

    output = pd.DataFrame()
    output['word'] = all_words
    output['true_orthography'] = orthographies
    output['predicted_cluster'] = predicted_clusters
    output['featurization'] = tf
    output['dim_reduction'] = dr
    output['latent_dim'] = d
    output['clustering'] = c
    output['max_ngram'] = nr
    output['n_clusters'] = k
    output['experiment_ix'] = ix + 1

    outputs.append(output)

# stack per-experiment results into one long frame and persist to disk
output_df = pd.concat(outputs, ignore_index=True)
output_df.to_pickle('../../data/etymological-clustering-results-052820.pkl')
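
# note: the relative paths above assume this script is run from its own
# directory (src/experimental/)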