diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index d4a2c4405..000000000 --- a/.editorconfig +++ /dev/null @@ -1,21 +0,0 @@ -# http://editorconfig.org - -root = true - -[*] -indent_style = space -indent_size = 4 -trim_trailing_whitespace = true -insert_final_newline = true -charset = utf-8 -end_of_line = lf - -[*.bat] -indent_style = tab -end_of_line = crlf - -[LICENSE] -insert_final_newline = false - -[Makefile] -indent_style = tab diff --git "a/docs/\340\270\247\340\270\264\340\270\230\340\270\265\340\270\225\340\270\264\340\270\224\340\270\225\340\270\261\340\271\211\340\270\207 PyThaiNLP \340\271\200\340\270\247\340\270\212\340\270\261\340\271\210\340\270\231\340\270\245\340\271\210\340\270\262\340\270\252\340\270\270\340\270\224\340\270\210\340\270\262\340\270\201 GitHub.md" "b/docs/\340\270\247\340\270\264\340\270\230\340\270\265\340\270\225\340\270\264\340\270\224\340\270\225\340\270\261\340\271\211\340\270\207 PyThaiNLP \340\271\200\340\270\247\340\270\212\340\270\261\340\271\210\340\270\231\340\270\245\340\271\210\340\270\262\340\270\252\340\270\270\340\270\224\340\270\210\340\270\262\340\270\201 GitHub.md" deleted file mode 100644 index 2357c6ccb..000000000 --- "a/docs/\340\270\247\340\270\264\340\270\230\340\270\265\340\270\225\340\270\264\340\270\224\340\270\225\340\270\261\340\271\211\340\270\207 PyThaiNLP \340\271\200\340\270\247\340\270\212\340\270\261\340\271\210\340\270\231\340\270\245\340\271\210\340\270\262\340\270\252\340\270\270\340\270\224\340\270\210\340\270\262\340\270\201 GitHub.md" +++ /dev/null @@ -1,7 +0,0 @@ -# วิธีติดตั้ง PyThaiNLP เวชั่นล่าสุดจาก GitHub - -ใช้คำสั่งนี้ในคอมมาไลน์ - -``` -pip install -U https://github.com/wannaphongcom/pythainlp/archive/pythainlp1.4.zip -``` \ No newline at end of file diff --git a/examples/collation.py b/examples/collation.py index f78b7ce77..80e66f0d6 100644 --- a/examples/collation.py +++ b/examples/collation.py @@ -1,3 +1,5 @@ # -*- coding: utf-8 -*- + from pythainlp.collation import collation -print(collation(['ไก่','ไข่','ก','ฮา'])) # ['ก', 'ไก่', 'ไข่', 'ฮา'] \ No newline at end of file + +print(collation(["ไก่", "ไข่", "ก", "ฮา"])) # ['ก', 'ไก่', 'ไข่', 'ฮา'] diff --git a/examples/etcc.py b/examples/etcc.py index 6e56f1979..f732fdf11 100644 --- a/examples/etcc.py +++ b/examples/etcc.py @@ -1,3 +1,5 @@ # -*- coding: utf-8 -*- + from pythainlp.tokenize import etcc -print(etcc.etcc('คืนความสุข')) # /คืน/ความสุข \ No newline at end of file + +print(etcc.etcc("คืนความสุข")) # /คืน/ความสุข diff --git a/examples/normalize.py b/examples/normalize.py index c83d6ddb9..cac000306 100644 --- a/examples/normalize.py +++ b/examples/normalize.py @@ -1,3 +1,5 @@ # -*- coding: utf-8 -*- + from pythainlp.util import normalize -print(normalize("เเปลก")=="แปลก") # เ เ ป ล ก กับ แปลก \ No newline at end of file + +print(normalize("เเปลก") == "แปลก") # เ เ ป ล ก กับ แปลก diff --git a/examples/romanization.py b/examples/romanization.py index 78c59f572..38ac4840a 100644 --- a/examples/romanization.py +++ b/examples/romanization.py @@ -1,3 +1,5 @@ # -*- coding: utf-8 -*- + from pythainlp.romanization import romanization -print(romanization("แมว")) \ No newline at end of file + +print(romanization("แมว")) diff --git a/examples/segment.py b/examples/segment.py deleted file mode 100644 index 665b26960..000000000 --- a/examples/segment.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -from pythainlp.tokenize import word_tokenize -a =u'ฉันรักภาษาไทยเพราะฉันเป็นคนไทยและฉันใช้ภาษาไทย' -b = 
word_tokenize(a) -print(b) diff --git a/examples/soundex.py b/examples/soundex.py index 98b2d7677..3b8e35c2e 100644 --- a/examples/soundex.py +++ b/examples/soundex.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- -from pythainlp.soundex import LK82,Udom83 -print(LK82('รถ')==LK82('รด')) -print(Udom83('วรร')==Udom83('วัน')) \ No newline at end of file + +from pythainlp.soundex import LK82, Udom83 + +print(LK82("รถ") == LK82("รด")) + +print(Udom83("วรร") == Udom83("วัน")) diff --git a/examples/spell.py b/examples/spell.py index 221c8089e..0d39ff07f 100644 --- a/examples/spell.py +++ b/examples/spell.py @@ -1,4 +1,8 @@ # -*- coding: utf-8 -*- -from pythainlp.spell import * -a=spell("สี่เหลียม") -print(a) # ['สี่เหลี่ยม'] \ No newline at end of file + +from pythainlp.spell import spell + +a = spell("สี่เหลียม") +print(a) # ['สี่เหลี่ยม'] + +# a = spell("สี่เหลียม", engine="hunspell") # available in some Linux systems diff --git a/examples/tcc.py b/examples/tcc.py index b999880f5..4d95aed43 100644 --- a/examples/tcc.py +++ b/examples/tcc.py @@ -1,3 +1,10 @@ # -*- coding: utf-8 -*- + from pythainlp.tokenize import tcc -print(tcc.tcc('ประเทศไทย')) # ป/ระ/เท/ศ/ไท/ย \ No newline at end of file + +print(tcc.tcc("ประเทศไทย")) # ป/ระ/เท/ศ/ไท/ย + +print(tcc.tcc_pos("ประเทศไทย")) # {1, 3, 5, 6, 8, 9} + +for ch in tcc.tcc_gen("ประเทศไทย"): # ป-ระ-เท-ศ-ไท-ย- + print(ch, end='-') diff --git a/examples/tokenize.py b/examples/tokenize.py new file mode 100644 index 000000000..c6b6028e5 --- /dev/null +++ b/examples/tokenize.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +from pythainlp.tokenize import sent_tokenize, word_tokenize + +text = "ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย " +print(text) + +print(sent_tokenize(text)) +# ['ฉันรักภาษาไทย', 'เพราะฉันใช้ภาษาไทย', ''] + +print(word_tokenize(text)) +# ['ฉัน', 'รัก', 'ภาษาไทย', ' ', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย', ' '] + +print(word_tokenize(text, whitespaces=False)) +# ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย'] + +text2 = "กฎหมายแรงงาน" +print(text2) + +print(word_tokenize(text2)) +# ['กฎหมายแรงงาน'] + +print(word_tokenize(text2, engine="longest-matching")) +# ['กฎหมาย', 'แรงงาน'] diff --git a/examples/whitespaceTokenizer.py b/examples/whitespaceTokenizer.py deleted file mode 100644 index 89e975282..000000000 --- a/examples/whitespaceTokenizer.py +++ /dev/null @@ -1,3 +0,0 @@ -# -*- coding: utf-8 -*- -from pythainlp.tokenize import WhitespaceTokenizer -print(WhitespaceTokenizer("ทดสอบ ตัดคำช่องว่าง")) \ No newline at end of file diff --git a/pythainlp/corpus/.ipynb_checkpoints/__init__-checkpoint.py b/pythainlp/corpus/.ipynb_checkpoints/__init__-checkpoint.py deleted file mode 100644 index b8361f789..000000000 --- a/pythainlp/corpus/.ipynb_checkpoints/__init__-checkpoint.py +++ /dev/null @@ -1,78 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals -from pythainlp.tools import get_path_db,get_path_data -from tinydb import TinyDB,Query -from future.moves.urllib.request import urlopen -from tqdm import tqdm -import requests -import os -import math -import requests -from nltk.corpus import names -#__all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"] -path_db_=get_path_db() -def get_file(name): - db=TinyDB(path_db_) - temp = Query() - if len(db.search(temp.name==name))>0: - path= get_path_data(db.search(temp.name==name)[0]['file']) - db.close() - if not os.path.exists(path): - download(name) - return path -def download_(url, dst): - """ - @param: url to download file - @param: dst place to put the file 
- """ - file_size = int(urlopen(url).info().get('Content-Length', -1)) - if os.path.exists(dst): - first_byte = os.path.getsize(dst) - else: - first_byte = 0 - if first_byte >= file_size: - return file_size - header = {"Range": "bytes=%s-%s" % (first_byte, file_size)} - pbar = tqdm( - total=file_size, initial=first_byte, - unit='B', unit_scale=True, desc=url.split('/')[-1]) - req = requests.get(url, headers=header, stream=True) - with(open(get_path_data(dst), 'wb')) as f: - for chunk in req.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - pbar.update(1024) - pbar.close() - #return file_size -def download(name,force=False): - db=TinyDB(path_db_) - temp = Query() - data=requests.get("https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json") - data_json=data.json() - if name in list(data_json.keys()): - temp_name=data_json[name] - print("Download : "+name) - if len(db.search(temp.name==name))==0: - print(name+" "+temp_name['version']) - download_(temp_name['download'],temp_name['file_name']) - db.insert({'name': name, 'version': temp_name['version'],'file':temp_name['file_name']}) - else: - if len(db.search(temp.name==name and temp.version==temp_name['version']))==0: - print("have update") - print("from "+name+" "+db.search(temp.name==name)[0]['version']+" update to "+name+" "+temp_name['version']) - yes_no="y" - if force==False: - yes_no=str(input("y or n : ")).lower() - if "y"==yes_no: - download_(temp_name['download'],temp_name['file_name']) - db.update({'version':temp_name['version']},temp.name==name) - else: - print("re-download") - print("from "+name+" "+db.search(temp.name==name)[0]['version']+" update to "+name+" "+temp_name['version']) - yes_no="y" - if force==False: - yes_no=str(input("y or n : ")).lower() - if "y"==yes_no: - download_(temp_name['download'],temp_name['file_name']) - db.update({'version':temp_name['version']},temp.name==name) - db.close() \ No newline at end of file diff --git a/pythainlp/sentiment/.ipynb_checkpoints/__init__-checkpoint.py b/pythainlp/sentiment/.ipynb_checkpoints/__init__-checkpoint.py deleted file mode 100644 index 77275d0db..000000000 --- a/pythainlp/sentiment/.ipynb_checkpoints/__init__-checkpoint.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals,print_function -import six -import sys -if six.PY2: - print("Thai sentiment in pythainlp. Not support python 2.7") - sys.exit(0) -import pythainlp -from pythainlp.corpus import stopwords -import os -from pythainlp.tokenize import word_tokenize -try: - import dill -except ImportError: - import pip - pip.main(['install','dill']) - try: - import dill - except ImportError: - print("Error ! 
using 'pip install dill'") - sys.exit(0) - -templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'sentiment') -def sentiment(text): - """ - sentiment ภาษาไทย - ใช้ข้อมูลจาก https://github.com/wannaphongcom/lexicon-thai/tree/master/ข้อความ/ - รับค่าสตริง str คืนค่า pos , neg""" - with open(os.path.join(templates_dir, 'vocabulary.data'), 'rb') as in_strm: - vocabulary = dill.load(in_strm) - in_strm.close() - with open(os.path.join(templates_dir, 'sentiment.data'), 'rb') as in_strm: - classifier = dill.load(in_strm) - in_strm.close() - text=set(word_tokenize(text))-set(stopwords.words('thai')) - featurized_test_sentence = {i:(i in text) for i in vocabulary} - return classifier.classify(featurized_test_sentence) -if __name__ == '__main__': - d="เสียใจแย่มากเลย" - print(sentiment(d)) \ No newline at end of file diff --git a/pythainlp/sentiment/.ipynb_checkpoints/build_pythainlp-checkpoint.tool b/pythainlp/sentiment/.ipynb_checkpoints/build_pythainlp-checkpoint.tool deleted file mode 100644 index b7a7f7107..000000000 --- a/pythainlp/sentiment/.ipynb_checkpoints/build_pythainlp-checkpoint.tool +++ /dev/null @@ -1,48 +0,0 @@ -# -*- coding: utf-8 -*- -# เครื่องมือในการ build sentiment เพื่อใช้ใน pythainlp -# เวชั่น 0.3 -# 2018/01/18 -# เขียนโดย นาย วรรณพงษ์ ภัททิยไพบูลย์ -# cc-by 3.0 Thai Sentiment Text https://github.com/wannaphongcom/lexicon-thai/tree/master/ข้อความ/ -# อ่านบทความได้ที่ https://python3.wannaphong.com/2017/02/ทำ-sentiment-analysis-ภาษาไทยใน-python.html -from nltk import NaiveBayesClassifier as nbc -import dill -from pythainlp.tokenize import word_tokenize -from pythainlp.corpus import stopwords -import codecs -from itertools import chain -thaistopwords = stopwords.words('thai') -# pos.txt -with codecs.open('pos.txt', 'r', "utf-8") as f: - lines = f.readlines() -listpos=[x for x in [e.strip() for e in lines] if x not in thaistopwords] -del lines -f.close() # ปิดไฟล์ -# neg.txt -with codecs.open('neg.txt', 'r', "utf-8") as f: - lines = f.readlines() -listneg=[x for x in [e.strip() for e in lines] if x not in thaistopwords] -f.close() # ปิดไฟล์ -print(1) -pos1=['pos']*len(listpos) -neg1=['neg']*len(listneg) -print(2) -training_data = list(zip(listpos,pos1)) + list(zip(listneg,neg1)) -print(3) -#vocabulary = set(chain(*[(set(word_tokenize(i[0]))-set(stopwords.words('thai'))) for i in training_data])) -#vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]])) -vocabulary = set(chain(*[word_tokenize(i[0]) for i in training_data])) -#print(vocabulary) -print(3.1) -feature_set = [({i:(i in word_tokenize(sentence)) for i in vocabulary},tag) for sentence, tag in training_data] -#print(feature_set) -print(4) -classifier = nbc.train(feature_set) -print(5) -with open('vocabulary.data', 'wb') as out_strm: - dill.dump(vocabulary,out_strm) -out_strm.close() -with open('sentiment.data', 'wb') as out_strm: - dill.dump(classifier,out_strm) -out_strm.close() -print('OK') \ No newline at end of file diff --git a/pythainlp/sentiment/.ipynb_checkpoints/ulmfit_sent-checkpoint.py b/pythainlp/sentiment/.ipynb_checkpoints/ulmfit_sent-checkpoint.py deleted file mode 100644 index aacd9e844..000000000 --- a/pythainlp/sentiment/.ipynb_checkpoints/ulmfit_sent-checkpoint.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- -''' -Code by https://github.com/cstorm125/thai2vec/tree/master/notebook -''' -from __future__ import absolute_import,unicode_literals -import os -import sys -from 
collections import defaultdict - -#numpy and dill -try: - import numpy as np - import dill as pickle -except ImportError: - import pip - pip.main(['install','numpy','dill']) - try: - import numpy as np - import dill as pickle - except ImportError: - print("Error installing using 'pip install numpy dill'") - sys.exit(0) - -#import torch -try: - import torch -except ImportError: - print('PyTorch required. See https://pytorch.org/.') -import torch -from torch.autograd import Variable -from torch import LongTensor - -#import fastai for multiBatchRNN -try: - from fastai.text import * -except ImportError: - print( - """ - fastai required for multiBatchRNN. - Run 'pip install https://github.com/fastai/fastai/archive/master.zip' - """) - -from pythainlp.tokenize import word_tokenize -from pythainlp.corpus import get_file -from pythainlp.corpus import download - -MODEL_NAME = 'sent_model' -ITOS_NAME = 'itos_sent' - -#download pretrained model -def get_path(fname): - path = get_file(fname) - if path==None: - download(fname) - path = get_file(fname) - return(path) - -#load model -m = torch.load(get_path(MODEL_NAME)) -m.eval() -#load itos and stoi -itos = pickle.load(open(get_path(ITOS_NAME),'rb')) -stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)}) - - -#get sentiment; 1 for positive and 0 for negative -#or score if specified return_score=True -softmax = lambda x : np.exp(x)/np.sum(np.exp(x)) -def get_sentiment(ss,return_score=False): - s = word_tokenize(ss) - t = LongTensor([stoi[i] for i in s]).view(-1,1).cpu() - t = Variable(t,volatile=False) - m.reset() - pred,*_ = m(t) - result = pred.data.cpu().numpy().reshape(-1) - if return_score: - return(softmax(result)) - else: - return(np.argmax(result)) - -def about(): - return ''' - Sentiment Analyzer based on thai2vec - Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge. - 89% accuracy based on 15% validation set compared to 72% of fastText and 52% most-frequent-class baseline. 
- - Development : Charin Polpanumas - GitHub : https://github.com/cstorm125/thai2vec - ''' \ No newline at end of file diff --git a/pythainlp/tokenize/.ipynb_checkpoints/__init__-checkpoint.py b/pythainlp/tokenize/.ipynb_checkpoints/__init__-checkpoint.py deleted file mode 100644 index c3d3f3731..000000000 --- a/pythainlp/tokenize/.ipynb_checkpoints/__init__-checkpoint.py +++ /dev/null @@ -1,227 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals -import nltk -import re -import codecs -from six.moves import zip -from pythainlp.corpus.thaisyllable import get_data -from pythainlp.corpus.thaiword import get_data as get_dict -from marisa_trie import Trie - -DEFAULT_DICT_TRIE = Trie(get_dict()) - -def dict_word_tokenize(text, custom_dict_trie, engine='newmm'): - ''' - dict_word_tokenize(text,file,engine) - เป็นคำสั่งสำหรับตัดคำโดยใช้ข้อมูลที่ผู้ใช้กำหนด - text คือ ข้อความที่ต้องการตัดคำ - custom_dict_trie คือ trie ที่สร้างจาก create_custom_dict_trie - engine คือ เครื่องมือตัดคำ - - newmm ตัดคำด้วย newmm - - wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ - - mm ตัดคำด้วย mm - - longest-matching ตัดคำโดยใช้ longest matching - ''' - - if engine=="newmm": - from .newmm import mmcut as segment - elif engine=="mm": - from .mm import segment - elif engine=='longest-matching': - from .longest import segment - elif engine=='wordcutpy': - from .wordcutpy import segment - return segment(text, custom_dict_trie.keys()) - - return segment(text, custom_dict_trie) - -def word_tokenize(text, engine='newmm',whitespaces=True): - """ - ระบบตัดคำภาษาไทย - - word_tokenize(text,engine='newmm') - text คือ ข้อความในรูปแบบ str - engine มี - - newmm - ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่ (ค่าเริ่มต้น) - - icu - engine ตัวดั้งเดิมของ PyThaiNLP (ความแม่นยำต่ำ) - - dict - ใช้ dicu ในการตัดคำไทย จะคืนค่า False หากไม่สามารถตัดคำไทย - - longest-matching ใช้ Longest matching ในการตัดคำ - - mm ใช้ Maximum Matching algorithm - โค้ดชุดเก่า - - pylexto ใช้ LexTo ในการตัดคำ - - deepcut ใช้ Deep Neural Network ในการตัดคำภาษาไทย - - wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ - """ - - if engine=='icu': - ''' - ตัดคำภาษาไทยโดยใช้ icu ในการตัดคำ - คำเตือน !!! 
\n คำสั่ง word_tokenize(text) ใน PyThaiNLP 1.6 - ค่าเริ่มต้นจะเปลี่ยนจาก icu ไปเป็น newmm''' - from .pyicu import segment - elif engine=='dict': - ''' - ใช้ dicu ในการตัดคำไทย - จะคืนค่า False หากไม่สามารถตัดคำไทย - ''' - from .dictsegment import segment - elif engine=='mm': - ''' - ใช้ Maximum Matching algorithm - โค้ดชุดเก่า - ''' - from .mm import segment - elif engine=='newmm': - ''' - ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่ - ''' - from .newmm import mmcut as segment - elif engine=='longest-matching': - ''' - ใช้ Longest matching ในการตัดคำ - ''' - from .longest import segment - elif engine=='pylexto': - ''' - ใช้ LexTo ในการตัดคำ - ''' - from .pylexto import segment - elif engine=='deepcut': - ''' - ใช้ Deep Neural Network ในการตัดคำภาษาไทย - ''' - from .deepcut import segment - elif engine=='cutkum': - ''' - ใช้ Deep Neural Network ในการตัดคำภาษาไทย (https://github.com/pucktada/cutkum) - ''' - from .cutkum import segment - elif engine=='wordcutpy': - ''' - wordcutpy ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ - ''' - from .wordcutpy import segment - else: - raise Exception("error no have engine.") - if whitespaces==False: - return [i.strip(' ') for i in segment(text) if i.strip(' ')!=''] - return segment(text) - -def sent_tokenize(text,engine='whitespace+newline'): - ''' - sent_tokenize(text,engine='whitespace+newline') - ตัดประโยคเบื้องต้น โดยการแบ่งด้วยช่องว่าง - ''' - if engine=='whitespace': - data=nltk.tokenize.WhitespaceTokenizer().tokenize(text) - elif engine=='whitespace+newline': - data=re.sub(r'\n+|\s+','|',text,re.U).split('|') - return data -def wordpunct_tokenize(text): - ''' - wordpunct_tokenize(text) - It is nltk.tokenize.wordpunct_tokenize(text). - ''' - return nltk.tokenize.wordpunct_tokenize(text) -def WhitespaceTokenizer(text): - return nltk.tokenize.WhitespaceTokenizer().tokenize(text) -def isthai(text,check_all=False): - """ - สำหรับเช็คว่าเป็นตัวอักษรภาษาไทยหรือไม่ - isthai(text,check_all=False) - text คือ ข้อความหรือ list ตัวอักษร - check_all สำหรับส่งคืนค่า True หรือ False เช็คทุกตัวอักษร - - การส่งคืนค่า - {'thai':% อักษรภาษาไทย,'check_all':tuple โดยจะเป็น (ตัวอักษร,True หรือ False)} - """ - listext=list(text) - i=0 - num_isthai=0 - if check_all==True: - listthai=[] - while i= 3584 and cVal <= 3711): - num_isthai+=1 - if check_all==True: - listthai.append(True) - else: - if check_all==True: - listthai.append(False) - i+=1 - thai=(num_isthai/len(listext))*100 - if check_all==True: - dictthai=tuple(zip(listext,listthai)) - data= {'thai':thai,'check_all':dictthai} - else: - data= {'thai':thai} - return data -def syllable_tokenize(text1): - """ - syllable_tokenize(text) - เป็นคำสั่งสำหรับใช้ตัดพยางค์ในภาษาไทย - รับ str - ส่งออก list - """ - text1=word_tokenize(text1) - data=[] - trie = create_custom_dict_trie(custom_dict_source=get_data()) - if(len(text1)>0): - i=0 - while(i', re.IGNORECASE) - self.re_rep = re.compile(r'(\S)(\1{3,})') - - def sub_br(self,x): return self.re_br.sub("\n", x) - - def tokenize(self,x): - return [t for t in word_tokenize(self.sub_br(x),engine=self.engine)] - - #replace aaaaaaaa - @staticmethod - def replace_rep(m): - TK_REP = 'tkrep' - c,cc = m.groups() - return f'{TK_REP}{len(cc)+1}{c}' - - def proc_text(self, s): - s = self.re_rep.sub(ThaiTokenizer.replace_rep, s) - s = re.sub(r'([/#])', r' \1 ', s) - #remvoe double space - s = re.sub(' {2,}', ' ', s) - return self.tokenize(s) - - @staticmethod - def proc_all(ss): - tok = ThaiTokenizer() - return 
[tok.proc_text(s) for s in ss] - - @staticmethod - def proc_all_mp(ss): - ncpus = num_cpus()//2 - with ProcessPoolExecutor(ncpus) as e: - return sum(e.map(ThaiTokenizer.proc_all, ss), []) - -#get tokenized texts -BOS = 'xbos' # beginning-of-sentence tag -def get_texts(df): - labels = df.iloc[:,0].values.astype(np.int64) - texts = BOS+df.iloc[:,1].astype(str).apply(lambda x: x.rstrip()) - tok = ThaiTokenizer().proc_all_mp(partition_by_cores(texts)) - return tok, list(labels) - -#get all tokenized texts -def get_all(df): - tok, labels = [], [] - for i, r in enumerate(df): - tok_, labels_ = get_texts(r) - tok += tok_; - labels += labels_ - return tok, labels - -#convert text dataframe to numericalized dataframes -def numericalizer(df, itos=None, max_vocab = 60000, min_freq = 2, pad_tok = '_pad_', unk_tok = '_unk_'): - tok, labels = get_all(df) - freq = Counter(p for o in tok for p in o) - if itos is None: - itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq] - itos.insert(0, pad_tok) - itos.insert(0, unk_tok) - stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)}) - lm = np.array([[stoi[o] for o in p] for p in tok]) - return(lm,tok,labels,itos,stoi,freq) - -#get document vectors from language model -def document_vector(ss, m, stoi,tok_engine='newmm'): - s = word_tokenize(ss) - t = LongTensor([stoi[i] for i in s]).view(-1,1).cuda() - t = Variable(t,volatile=False) - m.reset() - pred,*_ = m[0](t) - #get average of last lstm layer along bptt - res = to_np(torch.mean(pred[-1],0).view(-1)) - return(res) - -#merge pretrained embeddings with current embeddings -def merge_wgts(em_sz, wgts, itos_pre, itos_cls): - vocab_size = len(itos_cls) - enc_wgts = to_np(wgts['0.encoder.weight']) - #average weight of encoding - row_m = enc_wgts.mean(0) - stoi_pre = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos_pre)}) - #new embedding based on classification dataset - new_w = np.zeros((vocab_size, em_sz), dtype=np.float32) - for i,w in enumerate(itos_cls): - r = stoi_pre[w] - #use pretrianed embedding if present; else use the average - new_w[i] = enc_wgts[r] if r>=0 else row_m - wgts['0.encoder.weight'] = T(new_w) - wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w)) - wgts['1.decoder.weight'] = T(np.copy(new_w)) - return(wgts) - -#feature extractor -class SaveFeatures(): - features=None - def __init__(self, m): self.hook = m.register_forward_hook(self.hook_fn) - def hook_fn(self, module, input, output): self.features = output - def remove(self): self.hook.remove() - -#download pretrained language model -def get_path(fname): - path = get_file(fname) - if path==None: - download(fname) - path = get_file(fname) - return(path) - -def load_pretrained_model(): - path = get_path(MODEL_NAME) - wgts = torch.load(path, map_location=lambda storage, loc: storage) - return(wgts) - -def load_pretrained_itos(): - path = get_path(ITOS_NAME) - itos = pickle.load(open(path,'rb')) - return(itos) - -def about(): - return ''' - thai2vec - State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language. 
- Created as part of pyThaiNLP with ULMFit implementation from fast.ai - - Development : Charin Polpanumas - GitHub : https://github.com/cstorm125/thai2vec - ''' - - diff --git a/pythainlp/word_vector/.ipynb_checkpoints/__init__-checkpoint b/pythainlp/word_vector/.ipynb_checkpoints/__init__-checkpoint deleted file mode 100644 index 22e30a360..000000000 --- a/pythainlp/word_vector/.ipynb_checkpoints/__init__-checkpoint +++ /dev/null @@ -1,3 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals -from .thai2vec import * \ No newline at end of file diff --git a/pythainlp/word_vector/.ipynb_checkpoints/__init__-checkpoint.py b/pythainlp/word_vector/.ipynb_checkpoints/__init__-checkpoint.py deleted file mode 100644 index d61c3dfd8..000000000 --- a/pythainlp/word_vector/.ipynb_checkpoints/__init__-checkpoint.py +++ /dev/null @@ -1,2 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals \ No newline at end of file diff --git a/pythainlp/word_vector/.ipynb_checkpoints/thai2vec-checkpoint.py b/pythainlp/word_vector/.ipynb_checkpoints/thai2vec-checkpoint.py deleted file mode 100644 index 264228d70..000000000 --- a/pythainlp/word_vector/.ipynb_checkpoints/thai2vec-checkpoint.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- -''' -Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb -''' -from __future__ import absolute_import,unicode_literals -import six -import sys -if six.PY2: - print("Thai sentiment in pythainlp. Not support python 2.7") - sys.exit(0) -try: - from gensim.models import KeyedVectors - import numpy as np -except ImportError: - import pip - pip.main(['install','gensim','numpy']) - try: - from gensim.models import KeyedVectors - import numpy as np - except ImportError: - print("Error ! using 'pip install gensim numpy'") - sys.exit(0) -from pythainlp.tokenize import word_tokenize -from pythainlp.corpus import get_file -from pythainlp.corpus import download as download_data -import os - -def download(): - path = get_file('thai2vec02') - if path==None: - download_data('thai2vec02') - path = get_file('thai2vec02') - return path -def get_model(): - return KeyedVectors.load_word2vec_format(download(),binary=False) -def most_similar_cosmul(positive,negative): - ''' - การใช้งาน - input list - ''' - return get_model().most_similar_cosmul(positive=positive, negative=negative) -def doesnt_match(listdata): - return get_model().doesnt_match(listdata) -def similarity(word1,word2): - return get_model().similarity(word1,word2) -def sentence_vectorizer(ss,dim=300,use_mean=False): - s = word_tokenize(ss) - vec = np.zeros((1,dim)) - for word in s: - if word in get_model().wv.index2word: - vec+= get_model().wv.word_vec(word) - else: pass - if use_mean: vec /= len(s) - return(vec) - -def about(): - return ''' - thai2vec - State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language. - Created as part of pyThaiNLP with ULMFit implementation from fast.ai - - Development : Charin Polpanumas - GitHub : https://github.com/cstorm125/thai2vec - '''
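
Note on the custom-dictionary tokenization API whose .ipynb_checkpoints duplicate is deleted above: the checkpoint copy of `pythainlp/tokenize/__init__.py` documents `create_custom_dict_trie(custom_dict_source)` and `dict_word_tokenize(text, custom_dict_trie, engine)` only through Thai docstrings, and no example file in this diff exercises them. The sketch below is illustrative only, assuming the live `pythainlp.tokenize` module exposes the same functions and signatures as the deleted checkpoint shown above; exact output may vary between PyThaiNLP releases.

```python
# -*- coding: utf-8 -*-
# Illustrative sketch (not part of this diff): tokenizing with a user-supplied
# dictionary, following the signatures in the deleted tokenize/__init__
# checkpoint. Assumes the live pythainlp.tokenize module exposes the same API.

from pythainlp.tokenize import create_custom_dict_trie, dict_word_tokenize

# Build a trie from an in-memory word list (per the docstring, a path to a
# word-per-line file is also accepted).
trie = create_custom_dict_trie(custom_dict_source=["ฉัน", "รัก", "ภาษาไทย"])

print(dict_word_tokenize("ฉันรักภาษาไทย", trie, engine="newmm"))
# Expected with this dictionary: ['ฉัน', 'รัก', 'ภาษาไทย']
```

The same `engine` values listed in the `word_tokenize` docstring (`newmm`, `mm`, `longest-matching`, `wordcutpy`) are accepted here; `newmm` is the default used throughout the new `examples/tokenize.py`.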