-rw-r--r--  data/etymological-kmeans-clustering-results-052720.pkl               bin  0 -> 14311382 bytes
-rw-r--r--  src/experimental/__pycache__/etymological_clustering.cpython-37.pyc  bin  0 -> 1836 bytes
-rw-r--r--  src/experimental/etymological_clustering.py                          141
3 files changed, 141 insertions, 0 deletions
diff --git a/data/etymological-kmeans-clustering-results-052720.pkl b/data/etymological-kmeans-clustering-results-052720.pkl
new file mode 100644
index 0000000..efee4a9
--- /dev/null
+++ b/data/etymological-kmeans-clustering-results-052720.pkl
Binary files differ
diff --git a/src/experimental/__pycache__/etymological_clustering.cpython-37.pyc b/src/experimental/__pycache__/etymological_clustering.cpython-37.pyc
new file mode 100644
index 0000000..b6940fd
--- /dev/null
+++ b/src/experimental/__pycache__/etymological_clustering.cpython-37.pyc
Binary files differ
diff --git a/src/experimental/etymological_clustering.py b/src/experimental/etymological_clustering.py
new file mode 100644
index 0000000..13596cc
--- /dev/null
+++ b/src/experimental/etymological_clustering.py
@@ -0,0 +1,141 @@
+"""
+Etymological clustering of Yiddish nouns.
+
+Idea: dimensionality-reduce the ngram count / tfidf matrix
+      using a variety of methods & cluster the reduced words.
+
+Variables to experiment over:
+    - CV / TFIDF as count matrix
+    - n-gram range: 1, 2, 3
+    - Dim Reduction algo: TruncatedSVD, LLE, T-SNE, spectral, Isomap, MDS
+    - Dimensionality to reduce down to: 2, 5, 10, 15, 20
+    - Clustering algorithm: [kmeans, spectral]
+    - Number of clusters: 2, 3
+
+(c) Jonne Saleva, 2020
+"""
+
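+# For orientation, one cell of the experiment grid amounts to the following
+# minimal sketch (illustrative only; `words` is any list of strings, and this
+# particular configuration is an arbitrary point of the grid):
+#
+#     vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 2))
+#     X = vec.fit_transform(words).toarray()
+#     X_red = TruncatedSVD(n_components=5).fit_transform(X)
+#     labels = KMeans(n_clusters=2).fit_predict(X_red)
+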
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.manifold import Isomap, TSNE, MDS, LocallyLinearEmbedding, SpectralEmbedding
+from sklearn.cluster import KMeans, SpectralClustering
+import itertools as it
+import pandas as pd
+import numpy as np
+
+# helpers
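+# collapse a list of lists (here: one word list per orthography) into one flat list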
+def flatten(nested):
+    return list(it.chain.from_iterable(nested))
+
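+# build a character n-gram featurizer: raw counts ('cv') or tf-idf ('tfidf'),
+# over all n-grams of length 1 through nr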
+def get_featurizer(tf, nr):
+    if tf == 'cv':
+        feat = CountVectorizer(analyzer='char', ngram_range=(1, nr))
+    elif tf == 'tfidf':
+        feat = TfidfVectorizer(analyzer='char', ngram_range=(1, nr))
+    else:
+        raise ValueError('Invalid featurizer type')
+
+    return feat
+
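+# instantiate a dimensionality reducer by name, targeting d latent dimensions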
+def get_dim_reducer(dr, d):
+    inventory = {
+        'svd': TruncatedSVD, 
+        'lle': LocallyLinearEmbedding, 
+        'tsne': TSNE, 
+        'spectral': SpectralEmbedding, 
+        'isomap': Isomap, 
+        'mds': MDS
+    }
+    reducer_cls = inventory[dr]
+    reducer = reducer_cls(n_components=d)
+    return reducer
+
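+# instantiate a clustering algorithm by name, with k clusters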
+def get_clustering(c, k):
+    inventory = {'kmeans': KMeans, 'spectral': SpectralClustering}
+    clustering_cls = inventory[c]
+    clustering = clustering_cls(n_clusters=k)
+    return clustering
+
+# generate experiments
+text_featurizers = ['cv', 'tfidf']
+dim_reducers = ['svd', 'lle', 'spectral'] # tsne, isomap, mds
+clusterings = ['kmeans', 'spectral']
+ngram_ranges = [1, 2] # 3
+Ds = [2, 5, 10]
+Ks = [2, 3]
+
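+# Cartesian product over all factors: 2 * 3 * 2 * 2 * 3 * 2 = 144 configurations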
+experiments = it.product(text_featurizers, dim_reducers, clusterings, ngram_ranges, Ds, Ks)
+
+# load data
+corpus = pd.read_csv('../../data/corpus.csv')
+n_words = corpus.shape[0]
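+# corpus.csv holds one word column per orthography; stack them into one flat list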
+all_words = flatten([corpus[c].tolist() for c in corpus.columns])
+orthographies = ['romanized']*n_words + ['yivo']*n_words + ['chasidic']*n_words
+
+# perform experiments
+outputs = []
+for ix, (tf, dr, c, nr, d, k) in enumerate(experiments):
+
+    print('===============================')
+    print(f'Beginning experiment {ix+1}...')
+    print(f'Text featurization: {tf}')
+    print(f'N-gram range: (1, {nr})')
+    print(f'Dim reduction: {dr}')
+    print(f'Latent dimension: {d}')
+    print(f'Clustering algorithm: {c}')
+    print(f'No. of clusters: {k}')
+
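+    # two clusters corresponds to separating Hebrew-origin words from the rest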
+    if k == 2:
+        print('\t=> we are doing Hebrew identification')
+
+    # get the word-feature matrix
+    featurizer = get_featurizer(tf, nr)
+    X = featurizer.fit_transform(all_words)
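+    # vectorizers return sparse matrices; densify, since not every reducer accepts sparse input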
+    if not isinstance(X, np.ndarray):
+        X = X.toarray()
+
+    # dimensionality-reduce it
+    reducer = get_dim_reducer(dr, d)
+    X_reduced = reducer.fit_transform(X)
+
+    # cluster it
+    clustering = get_clustering(c, k)
+    predicted_clusters = clustering.fit_predict(X_reduced)
+
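+    # record predictions alongside the full experimental configuration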
+    output = pd.DataFrame()
+    output['word'] = all_words
+    output['orthography'] = orthographies
+    output['predicted_cluster'] = predicted_clusters
+    output['featurization'] = tf
+    output['dim_reduction'] = dr
+    output['latent_dim'] = d
+    output['clustering'] = c
+    output['max_ngram'] = nr
+    output['n_clusters'] = k
+    output['experiment_ix'] = ix + 1
+
+    outputs.append(output)
+
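+# stack all per-experiment frames and persist for downstream analysis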
+output_df = pd.concat(outputs, ignore_index=True)
+output_df.to_pickle('../../data/etymological-clustering-results-052820.pkl')
+
" Vim syntax file
" Language:    mu
" Maintainer:  Kartik Agaram <mu@akkartik.com>
" URL:         http://github.com/akkartik/mu
" License:     public domain
"
" Copy this into your ftplugin directory, and add the following to your vimrc
" or to .vim/ftdetect/mu.vim:
"   autocmd BufReadPost,BufNewFile *.mu,*.test set filetype=mu

let s:save_cpo = &cpo
set cpo&vim

" todo: why does this periodically lose syntax, like on file reload?
"   $ vim x.mu
"   :e
"? if exists("b:syntax")
"?   finish
"? endif
"? let b:syntax = "mu"

setlocal iskeyword=@,48-57,?,!,_,$,-
setlocal formatoptions-=t  " mu programs have long lines
setlocal formatoptions+=c  " but comments should still wrap

syntax match muComment /#.*$/ | highlight link muComment Comment
syntax match muSalientComment /##.*$/ | highlight link muSalientComment SalientComment
syntax match muComment /;.*$/ | highlight link muComment Comment
syntax match muSalientComment /;;.*$/ | highlight link muSalientComment SalientComment
set comments+=n:#
syntax match CommentedCode "#? .*"
let b:cmt_head = "#? "

" mu strings are inside [ ... ] and can span multiple lines
" don't match '[' at end of line, that's usually code
syntax region muString start=+\[[^\]]+ end=+\]+
syntax match muString "\[\]"
highlight link muString String
" mu syntax for representing the screen in scenarios
syntax region muScreen start=+ \.+ end=+\.$\|$+
highlight link muScreen muString

" mu literals
syntax match muLiteral %[^ ]\+:literal/[^ ,]*\|[^ ]\+:literal\>%
syntax match muLiteral %\<[0-9-]\?[0-9]\+\>%
syntax match muLiteral %\<[0-9-]\?[0-9]\+/[^ ,]*%
syntax match muLiteral "^\s\+[^ 0-9a-zA-Z{}#\[\]][^ ]*\s*$"
syntax match muLiteral %[^ ]\+:label/[^ ,]*\|[^ ]\+:label\>%
syntax match muLiteral "<[^ ]*>"
syntax match muLiteral %[^ ]\+:type/[^ ,]*\|[^ ]\+:type\>%
syntax match muLiteral %[^ ]\+:offset/[^ ,]*\|[^ ]\+:offset\>%
syntax match muLiteral %[^ ]\+:variant/[^ ,]*\|[^ ]\+:variant\>%
highlight link muLiteral Constant
syntax keyword muKeyword default-space global-space new-default-space local-scope next-ingredient ingredient rewind-ingredients load-ingredients | highlight link muKeyword Constant

syntax match muDelimiter "[{}]" | highlight link muDelimiter Delimiter
syntax match muAssign " <- \|\<raw\>" | highlight link muAssign SpecialChar
syntax match muGlobal %[^ ]\+:global/\?[^ ,]*% | highlight link muGlobal SpecialChar
syntax keyword muControl reply reply-if reply-unless return return-if return-unless jump jump-if jump-unless loop loop-if loop-unless break break-if break-unless current-continuation continue-from create-delimited-continuation reply-delimited-continuation | highlight muControl ctermfg=3
" common keywords
syntax match muRecipe "^recipe\>\|^recipe!\>\|^def\>\|^def!\>\|^before\>\|^after\>\| -> " | highlight muRecipe ctermfg=208
syntax match muScenario "^scenario\>" | highlight muScenario ctermfg=34
syntax match muPendingScenario "^pending-scenario\>" | highlight link muPendingScenario SpecialChar
syntax match muData "^type\>\|^container\>\|^exclusive-container\>" | highlight muData ctermfg=226

let &cpo = s:save_cpo