1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
"""
Etymological clustering of Yiddish nouns.
Idea: dimensionality-reduce the ngram count / tfidf matrix
using a variety of methods & cluster the reduced words.
Variables to experiment over:
- CV / TFIDF as count matrix
- n-gram range: 1, 2, 3
- Dim Reduction algo: TruncatedSVD, LLE, T-SNE, spectral, Isomap, MDS
- Dimensionality to reduce down to: 2, 5, 10, 15, 20
- Clustering algorithm: [kmeans, spectral]
- Number of clusters: 2, 3
(c) Jonne Saleva, 2020
"""
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import Isomap, TSNE, MDS, LocallyLinearEmbedding, SpectralEmbedding
from sklearn.cluster import KMeans, SpectralClustering
import itertools as it
import pandas as pd
import numpy as np
# helpers
def flatten(nested):
    """Flatten one level of nesting into a single list.

    Args:
        nested: An iterable whose items are themselves iterable.

    Returns:
        A new list holding the items of each inner iterable, in order.
        Only the outermost level is flattened; deeper nesting is kept as is.
    """
    # list() consumes the chained iterator directly; no per-item comprehension needed.
    return list(it.chain.from_iterable(nested))
def get_featurizer(tf, all_words, nr):
    """Build an unfitted character n-gram vectorizer.

    Args:
        tf: 'cv' selects CountVectorizer, 'tfidf' selects TfidfVectorizer.
        all_words: Not used by this function.
        nr: Upper bound of the n-gram range; the range used is (1, nr).

    Returns:
        The unfitted vectorizer instance.

    Raises:
        ValueError: If tf is neither 'cv' nor 'tfidf'.
    """
    # Reject unknown types first so the construction below stays flat.
    if tf not in ('cv', 'tfidf'):
        raise ValueError('Invalid featurizer type')
    vectorizer_cls = CountVectorizer if tf == 'cv' else TfidfVectorizer
    return vectorizer_cls(analyzer='char', ngram_range=(1, nr))
def get_dim_reducer(dr, X, d):
    """Build an unfitted dimensionality reducer.

    Args:
        dr: One of 'svd', 'lle', 'tsne', 'spectral', 'isomap', 'mds'.
        X: Not used by this function.
        d: Number of output dimensions, passed as n_components.

    Returns:
        The unfitted reducer instance.

    Raises:
        KeyError: If dr is not one of the names listed above.
    """
    inventory = {
        'svd': TruncatedSVD,
        'lle': LocallyLinearEmbedding,
        'tsne': TSNE,
        'spectral': SpectralEmbedding,
        'isomap': Isomap,
        'mds': MDS,
    }
    reducer_cls = inventory[dr]
    # t-SNE's default Barnes-Hut solver only accepts n_components < 4 and
    # raises at fit time otherwise; switch to the exact solver for larger d.
    if reducer_cls is TSNE and d > 3:
        return reducer_cls(n_components=d, method='exact')
    return reducer_cls(n_components=d)
def get_clustering(c, X, k):
    """Build an unfitted clustering estimator.

    Args:
        c: 'kmeans' selects KMeans, 'spectral' selects SpectralClustering.
        X: Not used by this function.
        k: Number of clusters, passed as n_clusters.

    Returns:
        The unfitted clustering instance.

    Raises:
        KeyError: If c is not 'kmeans' or 'spectral'.
    """
    available = dict(kmeans=KMeans, spectral=SpectralClustering)
    return available[c](n_clusters=k)
# generate experiments
# Each list below is one axis of the experiment grid. The string keys are the
# ones understood by get_featurizer, get_dim_reducer and get_clustering.
text_featurizers = ['cv', 'tfidf']
dim_reducers = ['svd', 'lle', 'spectral']  # tsne, isomap, mds
clusterings = ['kmeans', 'spectral']
ngram_ranges = [1, 2, 3]  # upper n-gram bound; the lower bound is always 1
Ds = [2, 5, 10]  # latent dimensions to reduce down to
Ks = [2, 3]  # numbers of clusters
# Full Cartesian product of the axes; every entry is a tuple ordered as
# (featurizer, reducer, clustering, max n-gram, latent dim, n clusters).
experiments = list(
    it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)
)
# load data
corpus = pd.read_csv('../../data/corpus.csv')
n_words = len(corpus)
# Words are laid out column by column: every row of the first column,
# then every row of the second column, and so on.
all_words = flatten(corpus[col].tolist() for col in corpus.columns)
# One label per entry of all_words, in three consecutive runs of n_words.
# NOTE(review): this presumes the CSV has exactly these three columns in this
# order -- confirm against the data file.
orthographies = [
    label
    for label in ('romanized', 'yivo', 'chasidic')
    for _ in range(n_words)
]
# perform experiments
# For every configuration in `experiments`: featurize the words, reduce the
# feature matrix, cluster the reduced points, and record one result row per
# word. All rows are concatenated and pickled at the end.
outputs = []
n_exp = len(experiments)

# The vectorizer output depends only on (tf, nr) and the fixed word list, so
# each distinct setting is fitted once and its sparse matrix reused instead of
# being re-fitted for every experiment.
featurized = {}

for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
    print('===============================')
    print(f'Beginning experiment {ix+1} / {n_exp}...')
    print(f'Text featurization: {tf}')
    print(f'N-gram range: (1, {nr})')
    print(f'Dim reduction: {dr}')
    print(f'Latent dimension: {d}')
    print(f'Clustering algorithm: {c}')
    print(f'No. of clusters: {k}')
    if k == 2:
        print('\t=> we are doing Hebrew identification')

    # get the word-feature matrix
    if (tf, nr) not in featurized:
        featurizer = get_featurizer(tf, all_words, nr)
        featurized[(tf, nr)] = featurizer.fit_transform(all_words)
    X = featurized[(tf, nr)]
    # Densify a fresh copy per experiment; only the sparse form is cached.
    if not isinstance(X, np.ndarray):
        X = X.toarray()

    # dimensionality-reduce it
    reducer = get_dim_reducer(dr, X, d)
    X_reduced = reducer.fit_transform(X)

    # cluster it
    clustering = get_clustering(c, X_reduced, k)
    predicted_clusters = clustering.fit_predict(X_reduced)

    # One row per word; the scalar settings are broadcast to every row.
    output = pd.DataFrame({
        'word': all_words,
        'true_orthography': orthographies,
        'predicted_cluster': predicted_clusters,
        'featurization': tf,
        'dim_reduction': dr,
        'latent_dim': d,
        'clustering': c,
        'max_ngram': nr,
        'n_clusters': k,
        'experiment_ix': ix + 1,  # 1-based, matching the progress printout
    })
    outputs.append(output)

output_df = pd.concat(outputs, ignore_index=True)
output_df.to_pickle('../../data/etymological-clustering-results-052820.pkl')
|