diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3046623
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pandas
+numpy
+scikit-learn
diff --git a/src/experimental/etymological_clustering.py b/src/experimental/etymological_clustering.py
index 13596cc..43a268d 100644
--- a/src/experimental/etymological_clustering.py
+++ b/src/experimental/etymological_clustering.py
@@ -63,24 +63,25 @@ def get_clustering(c, X, k):
text_featurizers = ['cv', 'tfidf']
dim_reducers = ['svd', 'lle', 'spectral'] # tsne, isomap, mds
clusterings = ['kmeans', 'spectral']
-ngram_ranges = [1, 2] # 3
+ngram_ranges = [1, 2, 3]
Ds = [2, 5, 10]
Ks = [2,3]
-experiments = it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)
+experiments = list(it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks))
# load data
corpus = pd.read_csv('../../data/corpus.csv')
n_words = corpus.shape[0]
all_words = flatten([corpus[c].tolist() for c in corpus.columns])
-orthographies ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
+orthographies = ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
# perform experiments
outputs = []
+n_exp = len(experiments)
for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
print('===============================')
- print(f'Beginning experiment {ix+1}...')
+ print(f'Beginning experiment {ix+1} / {n_exp}...')
print(f'Text featurization: {tf}')
print(f'N-gram range: (1, {nr})')
print(f'Dim reduction: {dr}')
@@ -108,6 +109,7 @@ for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
output = pd.DataFrame()
output['word'] = all_words
+ output['true_orthography'] = orthographies
output['predicted_cluster'] = predicted_clusters
output['featurization'] = tf
output['dim_reduction'] = dr
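Note (not part of the patch): a minimal sketch of why the change above materializes the experiment grid into a list. A bare `it.product(...)` iterator has no `len()` and is exhausted after one pass, so the new `{ix+1} / {n_exp}` progress message needs the grid realized up front. The names and value lists below are copied from the diff; the printed count is illustrative.

    import itertools as it

    # Grids copied from the patch above.
    text_featurizers = ['cv', 'tfidf']
    dim_reducers = ['svd', 'lle', 'spectral']
    clusterings = ['kmeans', 'spectral']
    ngram_ranges = [1, 2, 3]
    Ds = [2, 5, 10]
    Ks = [2, 3]

    # list() materializes the one-shot product iterator so it can be
    # counted before iteration; it.product itself has no len().
    experiments = list(it.product(text_featurizers, dim_reducers,
                                  clusterings, ngram_ranges, Ds, Ks))
    print(len(experiments))  # 2 * 3 * 2 * 3 * 3 * 2 = 216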