diff options
-rw-r--r-- | requirements.txt | 3 |
-rw-r--r-- | src/experimental/etymological_clustering.py | 10 |
2 files changed, 9 insertions, 4 deletions
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3046623
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pandas
+numpy
+scikit-learn
diff --git a/src/experimental/etymological_clustering.py b/src/experimental/etymological_clustering.py
index 13596cc..43a268d 100644
--- a/src/experimental/etymological_clustering.py
+++ b/src/experimental/etymological_clustering.py
@@ -63,24 +63,25 @@ def get_clustering(c, X, k):
 text_featurizers = ['cv', 'tfidf']
 dim_reducers = ['svd', 'lle', 'spectral'] # tsne, isomap, mds
 clusterings = ['kmeans', 'spectral']
-ngram_ranges = [1, 2] # 3
+ngram_ranges = [1, 2, 3]
 Ds = [2, 5, 10]
 Ks = [2,3]
-experiments = it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)
+experiments = [e for e in it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)]
 # load data
 corpus = pd.read_csv('../../data/corpus.csv')
 n_words = corpus.shape[0]
 all_words = flatten([corpus[c].tolist() for c in corpus.columns])
-orthographies ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
+orthographies = ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
 # perform experiments
 outputs = []
+n_exp = len(experiments)
 for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
     print('===============================')
-    print(f'Beginning experiment {ix+1}...')
+    print(f'Beginning experiment {ix+1} / {n_exp}...')
     print(f'Text featurization: {tf}')
     print(f'N-gram range: (1, {nr})')
     print(f'Dim reduction: {dr}')
@@ -108,6 +109,7 @@ for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
     output = pd.DataFrame()
     output['word'] = all_words
+    output['true_orthography'] = orthographies
     output['predicted_cluster'] = predicted_clusters
     output['featurization'] = tf
     output['dim_reduction'] = dr