-
-
Notifications
You must be signed in to change notification settings - Fork 63
Expand file tree
/
Copy pathlda_basic.py
More file actions
28 lines (24 loc) · 1.03 KB
/
lda_basic.py
File metadata and controls
28 lines (24 loc) · 1.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import sys
import tomotopy as tp
def lda_example(input_file, save_path):
    """Train an LDA topic model on a whitespace-tokenized corpus and save it.

    Each line of ``input_file`` is treated as one document whose tokens are
    separated by whitespace. Corpus statistics and the final topics are
    printed to stdout; progress messages go to stderr.

    Args:
        input_file: Path to a UTF-8 text file with one document per line.
        save_path: Path the trained model is written to via ``mdl.save``.
    """
    mdl = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5, k=20)

    # Use a context manager so the corpus file is closed deterministically,
    # even if add_doc raises part-way through.
    with open(input_file, encoding='utf-8') as corpus:
        for line in corpus:
            mdl.add_doc(line.strip().split())

    mdl.burn_in = 100
    # Zero-iteration train call, made before the statistics are read.
    # NOTE(review): per the tomotopy docs this prepares the model (vocabulary
    # filtering by min_cf / rm_top) so the numbers printed below are
    # populated - confirm against the installed tomotopy version.
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)

    print('Training...', file=sys.stderr, flush=True)
    mdl.train(1000, show_progress=True)
    mdl.summary()

    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)

    # Dump every topic's (word, probability) pairs as tab-separated rows.
    for k in range(mdl.k):
        print('Topic #{}'.format(k))
        for word, prob in mdl.get_topic_words(k):
            print('\t', word, prob, sep='\t')
# Script entry point: these statements run at module top level, so they also
# execute when this file is imported.
# You can get the sample data file 'enwiki-stemmed-1000.txt'
# at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
# Both paths below are relative, so the sample file is read from (and the
# model file 'test.lda.bin' is written to) the current working directory.
print('Running LDA')
lda_example('enwiki-stemmed-1000.txt', 'test.lda.bin')