about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jonne <jonnesaleva@brandeis.edu> 2020-05-28 01:23:09 -0400
committer Jonne <jonnesaleva@brandeis.edu> 2020-05-28 01:23:09 -0400
commit e8cacedac9259d227063ccf5436f482d8ca03235 (patch)
tree f2b5fd5bf7041fa4cc4c800cf1e5c21e27f07dcb
parent 3964eecc4cf576035afcca635efc1a41f73f0bdd (diff)
download yi-word-clustering-e8cacedac9259d227063ccf5436f482d8ca03235.tar.gz
-rw-r--r-- requirements.txt 3
-rw-r--r-- src/experimental/etymological_clustering.py 10
2 files changed, 9 insertions, 4 deletions
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3046623
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pandas
+numpy
+scikit-learn
diff --git a/src/experimental/etymological_clustering.py b/src/experimental/etymological_clustering.py
index 13596cc..43a268d 100644
--- a/src/experimental/etymological_clustering.py
+++ b/src/experimental/etymological_clustering.py
@@ -63,24 +63,25 @@ def get_clustering(c, X, k):
 text_featurizers = ['cv', 'tfidf']
 dim_reducers = ['svd', 'lle', 'spectral'] # tsne, isomap, mds
 clusterings = ['kmeans', 'spectral']
-ngram_ranges = [1, 2] # 3
+ngram_ranges = [1, 2, 3]
 Ds = [2, 5, 10]
 Ks = [2,3]
 
-experiments = it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)
+experiments = [e for e in it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)]
 
 # load data
 corpus = pd.read_csv('../../data/corpus.csv')
 n_words = corpus.shape[0]
 all_words = flatten([corpus[c].tolist() for c in corpus.columns])
-orthographies ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
+orthographies = ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
 
 # perform experiments
 outputs = []
+n_exp = len(experiments)
 for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
 
     print('===============================')
-    print(f'Beginning experiment {ix+1}...')
+    print(f'Beginning experiment {ix+1} / {n_exp}...')
     print(f'Text featurization: {tf}')
     print(f'N-gram range: (1, {nr})')
     print(f'Dim reduction: {dr}')
@@ -108,6 +109,7 @@ for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
 
     output = pd.DataFrame()
     output['word'] = all_words
+    output['true_orthography'] = orthographies
     output['predicted_cluster'] = predicted_clusters
     output['featurization'] = tf
     output['dim_reduction'] = dr