# Rough (admittedly buggy) script written to compare similar sentences.
# It uses MeCab for morphological analysis.
# The post consists of two Python 2 listings; save each one under the file
# name shown in its banner so that test.py can import mecab_test.

# ---------------------------------------------------------------------------
# mecab_test.py
# ---------------------------------------------------------------------------
# -*- coding: cp932 -*-
"""Thin ctypes wrapper around the MeCab C library (libmecab.dll).

Byte strings are handed straight to the C API, so the text passed in must
already be in the encoding MeCab's dictionary expects.
"""
from ctypes import *


class Mecab_Test:
    """Own one MeCab tagger handle and run text through it."""

    def __init__(self,
                 lib_path="C:/Program Files/MeCab/bin/libmecab.dll",
                 options='-a'):
        """Load libmecab and create a tagger.

        lib_path -- location of the MeCab shared library to load.
        options  -- command-line style option string given to mecab_new2.

        Raises RuntimeError when MeCab cannot create the tagger.
        """
        self.lib = cdll.LoadLibrary(lib_path)

        # mecab_t *mecab_new2(const char *arg): takes ONE option string.
        self.lib.mecab_new2.restype = c_void_p
        self.lib.mecab_new2.argtypes = [c_char_p]

        # const char *mecab_sparse_tostr(mecab_t *mecab, const char *str)
        # The handle must be declared as a pointer: without argtypes ctypes
        # passes a Python integer as a C int, truncating 64-bit addresses.
        self.lib.mecab_sparse_tostr.restype = c_char_p
        self.lib.mecab_sparse_tostr.argtypes = [c_void_p, c_char_p]

        self.m = self.lib.mecab_new2(options)
        if not self.m:
            # A NULL handle would otherwise crash inside the next C call.
            raise RuntimeError(
                "mecab_new2 failed for options %r" % (options,))

    # NOTE: the parameter keeps its original name ``str`` (which shadows the
    # builtin) so that existing keyword callers keep working.
    def conv_mecab(self, str):
        """Return MeCab's textual analysis of ``str``.

        The result is whatever mecab_sparse_tostr produces; ctypes turns a
        NULL result into None.
        """
        return self.lib.mecab_sparse_tostr(self.m, str)


if __name__ == "__main__":
    m = Mecab_Test()
    for sample in ('今日は元気だ。', '明日は雨だろう。'):
        print(m.conv_mecab(sample))

# ---------------------------------------------------------------------------
# test.py
# ---------------------------------------------------------------------------
# -*- coding: cp932 -*-
"""Print, for every sentence in sample6.txt, the list of its general nouns."""
import re
import string
import mecab_test

filename = 'sample6.txt'
# ``with`` guarantees the file is closed even if reading fails.
with open(filename, 'r') as inputfile:
    lines = inputfile.readlines()

m_test = mecab_test.Mecab_Test()


class CharSet:
    """Accumulator for the nouns extracted from the analysed sentences."""

    def __init__(self):
        self.first_array = []    # nouns of the sentence currently being read
        self.second_array = []   # one finished noun list per sentence

    def drawArray(self):
        """Print each collected noun list on its own line."""
        for line in self.second_array:
            print(line)


cs = CharSet()


def analizer_str(strs):
    """Collect the general nouns found in one block of MeCab output.

    ``strs`` is expected to hold one ``surface<TAB>feature,feature,...`` entry
    per line, terminated by a line reading ``EOS``.  Surfaces whose first
    feature starts with '名詞' (noun) and whose second feature starts with
    '一般' (general) are appended to ``cs.first_array``.  On ``EOS`` the
    current list is moved into ``cs.second_array`` and parsing stops.
    """
    for basestr in strs.split('\n'):
        if basestr == 'EOS':
            cs.second_array.append(cs.first_array)
            cs.first_array = []
            break

        surface, tab, feature = basestr.partition('\t')
        if not tab:
            # Not a "surface<TAB>features" entry (e.g. a blank line); the
            # original tuple-unpacking raised ValueError here.
            continue
        if not feature.startswith('名詞'):
            continue

        fields = feature.split(',')
        # Guard the index: a feature string may have fewer than two fields.
        if len(fields) > 1 and fields[1].startswith('一般'):
            cs.first_array.append(surface)


for line in lines:
    for sentence in line.strip().split('。'):
        if not sentence:
            # Empty fragment (e.g. the piece after a trailing '。').  Skip it
            # rather than abandoning the remaining sentences of the line.
            continue
        # split() removed the full stop; put it back before analysing.
        analizer_str(m_test.conv_mecab(sentence + '。'))

cs.drawArray()