mrzzcn · pull · Jan 17, 2019
diff --git a/tools/IME/Baidu.py b/tools/IME/Baidu.py
@@ -61,6 +61,7 @@ def _get_word(self, data, length = 0, pure_english = False):
         else:
             word.pinyin = ' '.join(pinyin)
             hanzi = byte2str(data[pos:pos+ length*2])
+            hanzi = hanzi.strip('\x00')
             pos = pos+ length*2
             word.value = hanzi.encode('utf-8')
         return word
@@ -101,9 +102,16 @@ def read(self, data):
                 pos = pos + length * 4
             if word.value:
                 if self.dictionary.has_key(word.pinyin):
-                    self.dictionary[word.pinyin].append(word)
+                    # Verify that the word's character count equals the number of pinyin syllables; discard the entry if they differ
+                    if len(word.value.decode('utf-8')) == len(word.pinyin.split(' ')):
+                        self.dictionary[word.pinyin].append(word)
+        #                print(word.value.decode('utf-8'), len(word.value.decode('utf-8')), word.pinyin,len(word.pinyin.split(' ')))
+                    #if word.value.decode('utf-8') == u'\u7fbd\u5b50':
+#                        print(word.dump())
                 else:
-                    self.dictionary[word.pinyin] = []
-                    self.dictionary[word.pinyin].append(word)
+                    if len(word.value.decode('utf-8')) == len(word.pinyin.split(' ')):
+                        self.dictionary[word.pinyin] = []
+                        self.dictionary[word.pinyin].append(word)
+#                        print(word.dump())
 
         return self.dictionary
diff --git a/tools/IME/tools.py b/tools/IME/tools.py
@@ -88,6 +88,13 @@ def __init__(self, value='', encoding='utf-8', count=0):
         self.encoding = encoding
     def __repr__(self):
         return self.value
+    def dump(self):
+        return {'value' : self.value,
+                'count' : self.count,
+                'pinyin' : self.pinyin,
+                'encoding' : self.encoding
+                }
+
 
 class WordDict(dict):
     def _opencc(self, string):
@@ -116,8 +123,23 @@ def zhs_to_zht(self):
         for line in tmp_string.split('\n'):
             key, words = line.split(':', 1)
             words = words.split('\t')
+
             for i in xrange(len(self[key])):
-                self[key][i].value = words[i]
+                    self[key][i].value = words[i]
+#            print(self[key], len(self[key]), words)
+#            for i in xrange(len(self[key])):
+#                if len(self[key][i].value.decode('utf-8')) == len(words[i].decode('utf-8')):
+#                    self[key][i].value = words[i]
+#                else:
+#                    for j in xrange(len(words)):
+#                        #print(self[key][i].value.decode('utf-8'), words[j].decode('utf-8'))
+#                        print(self[key][i].value)
+#                        print(words[j])
+#                        #if self[key][i].value.decode('utf-8') == words[j].decode('utf-8'):
+#                        if len(self[key][i].value) == len(words[j].decode('utf-8')):
+#                            print(self[key])
+#                            self[key][i].value = words[j]
+
 #            self[key] = map(self._opencc, self[key])
 #            for i in xrange(len(self[key])):
 #                self[key][i].value = self._opencc(self[key][i].value)