From e8cacedac9259d227063ccf5436f482d8ca03235 Mon Sep 17 00:00:00 2001
From: Jonne <jonnesaleva@brandeis.edu>
Date: Thu, 28 May 2020 01:23:09 -0400
Subject: Add requirements.txt, fix orthographies assignment, show experiment progress

---
 requirements.txt                            |  3 +++
 src/experimental/etymological_clustering.py | 10 ++++++----
 2 files changed, 9 insertions(+), 4 deletions(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3046623
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pandas
+numpy
+scikit-learn
diff --git a/src/experimental/etymological_clustering.py b/src/experimental/etymological_clustering.py
index 13596cc..43a268d 100644
--- a/src/experimental/etymological_clustering.py
+++ b/src/experimental/etymological_clustering.py
@@ -63,24 +63,25 @@ def get_clustering(c, X, k):
 text_featurizers = ['cv', 'tfidf']
 dim_reducers = ['svd', 'lle', 'spectral'] # tsne, isomap, mds
 clusterings = ['kmeans', 'spectral']
-ngram_ranges = [1, 2] # 3
+ngram_ranges = [1, 2, 3]
 Ds = [2, 5, 10]
 Ks = [2,3]
 
-experiments = it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)
+experiments = [e for e in it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)]
 
 # load data
 corpus = pd.read_csv('../../data/corpus.csv')
 n_words = corpus.shape[0]
 all_words = flatten([corpus[c].tolist() for c in corpus.columns])
-orthographies ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
+orthographies = ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
 
 # perform experiments
 outputs = []
+n_exp = len(experiments)
 for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
 
     print('===============================')
-    print(f'Beginning experiment {ix+1}...')
+    print(f'Beginning experiment {ix+1} / {n_exp}...')
     print(f'Text featurization: {tf}')
     print(f'N-gram range: (1, {nr})')
     print(f'Dim reduction: {dr}')
@@ -108,6 +109,7 @@ for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
 
     output = pd.DataFrame()
     output['word'] = all_words
+    output['true_orthography'] = orthographies
     output['predicted_cluster'] = predicted_clusters
     output['featurization'] = tf
     output['dim_reduction'] = dr
-- 
cgit 1.4.1-2-gfad0