1. Get the raw text content
def FileRead(self, filePath):
    f = open(filePath)
    raw = f.read()
    return raw
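The snippet above never closes the file handle. A minimal alternative sketch (not the author's code) that reads the file inside a with-block so the handle is closed automatically:

def FileRead(self, filePath):
    # the with-block closes the file even if read() raises
    with open(filePath) as f:
        return f.read()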
2. Split into sentences
def SenToken(self, raw):  # split raw text into sentences
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(raw)
    return sents
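For reference, a quick check of what the punkt sentence tokenizer returns (the sample text is made up for illustration; requires the punkt data package):

import nltk

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_tokenizer.tokenize("This is the first sentence. Here is another one."))
# expected: ['This is the first sentence.', 'Here is another one.']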
3. Clean the sentence content: remove digits, punctuation, and non-alphabetic characters
def CleanLines(self, line):
    identify = string.maketrans('', '')
    delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
    cleanLine = line.translate(identify, delEStr)  # strip ASCII punctuation and digits
    return cleanLine
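Note that string.maketrans() and the two-argument str.translate() used here only exist in Python 2. If you are on Python 3, a roughly equivalent sketch (a standalone helper, not part of the original class) would be:

import string

def clean_line_py3(line):
    # map every ASCII punctuation character and digit to None, i.e. delete it
    del_table = str.maketrans('', '', string.punctuation + string.digits)
    return line.translate(del_table)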
4. POS tagging with nltk.pos_tag
def POSTagger(self, sents):  # POS-tag each sentence
    taggedLine = [nltk.pos_tag(sent) for sent in sents]
    return taggedLine
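Keep in mind that nltk.pos_tag expects a list of tokens, not a raw sentence string. A small usage sketch (the tags shown are roughly what the default tagger produces):

import nltk

tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
print(nltk.pos_tag(tokens))
# roughly: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ'), ...]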
5. Word tokenization with nltk.word_tokenize
def WordTokener(self, sent):  # split a single sentence string into words
    wordsInStr = nltk.word_tokenize(sent)
    return wordsInStr
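word_tokenize splits punctuation and clitics into separate tokens, which is why the pipeline below runs CleanLines before it. A quick check:

import nltk

print(nltk.word_tokenize("Hello, it's 2014."))
# expected: ['Hello', ',', 'it', "'s", '2014', '.']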
6. Spell checking with enchant
def WordCheck(self, words):  # spell check: show suggestions and let the user correct manually
    d = enchant.Dict("en_US")
    checkedWords = []
    for word in words:
        if not d.check(word):
            print d.suggest(word)
            word = raw_input()
        checkedWords.append(word)
    return checkedWords
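A minimal non-interactive pyenchant sketch, just to show what check() and suggest() return (the misspelling is made up):

import enchant

d = enchant.Dict("en_US")
print(d.check("helo"))    # False
print(d.suggest("helo"))  # e.g. ['hole', 'help', 'hello', ...] depending on the installed dictionary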
7. Remove stopwords, lowercase, and drop short words
def CleanWords(self, wordsInStr):  # drop stopwords and words shorter than 3 chars, lowercase everything
    cleanWords = []
    stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
    for words in wordsInStr:
        cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
    return cleanWords
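conf.PreConfig.ENSTOPWORDS is a project-specific stopword file. If you do not have one, the same filtering can be sketched with NLTK's built-in English stopword list (a substitute for the author's setup, not part of it):

from nltk.corpus import stopwords  # requires nltk.download('stopwords')

stop_set = set(stopwords.words('english'))
words = ['The', 'results', 'are', 'in', 'the', 'table']
print([w.lower() for w in words if w.lower() not in stop_set and len(w) >= 3])
# expected: ['results', 'table']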
8. Stemming with WordNet
def StemWords(self, cleanWordsList):
    stemWords = []
    # porter = nltk.PorterStemmer()  # reportedly gives poor results, not very professional
    # result = [porter.stem(t) for t in cleanTokens]
    for words in cleanWordsList:
        stemWords += [[wn.morphy(w) for w in words]]
    return stemWords
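wn.morphy() returns None for words it cannot map to a WordNet lemma, so the lists returned above may contain None entries. A small guard sketch (a hypothetical helper, not in the original code) that falls back to the original word:

from nltk.corpus import wordnet as wn

def safe_morphy(word):
    # keep the original word when WordNet has no lemma for it
    lemma = wn.morphy(word)
    return lemma if lemma is not None else word

print([safe_morphy(w) for w in ['cars', 'churches', 'xyzzy']])
# expected: ['car', 'church', 'xyzzy']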
9. Complete code
#coding=utf-8
'''
Created on 2014-3-20
English stemming and stopword removal
@author: liTC
'''
import nltk
# import enchant
import string
import re
import os
from config import Config as conf
from nltk.corpus import wordnet as wn
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class EnPreprocess:
    '''Overall pipeline:
    Read file: FileRead(), filepath to raw
    Split into sentences: SenToken(), raw to sents
    (POS tagging): POSTagger(), sent to words[]
    Split sentences into words: TokenToWords(), sent to word[]
    (Spell check): WordCheck(), drop misspelled words or wait for manual correction
    Strip punctuation and non-alphabetic content: CleanLines(), line to cleanLine
    Drop words shorter than 3 chars, lowercase, remove stopwords: CleanWords(), words[] to cleanWords[]
    Stemming: StemWords(), words to stemWords
    Second pass: run CleanWords() again to make the sentences cleaner
    '''
    def __init__(self):
        print 'English token and stopwords remove...'

    def FileRead(self, filePath):  # read file contents
        f = open(filePath)
        raw = f.read()
        return raw

    def WriteResult(self, result, resultPath):
        self.mkdir(str(resultPath).replace(str(resultPath).split('/')[-1], ''))
        f = open(resultPath, "w")  # save the result to another file
        f.write(str(result))
        f.close()

    def SenToken(self, raw):  # split into sentences
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_tokenizer.tokenize(raw)
        return sents

    def POSTagger(self, sents):  # POS-tag each sentence
        taggedLine = [nltk.pos_tag(sent) for sent in sents]
        return taggedLine

    def WordTokener(self, sent):  # split a single sentence string into words
        wordsInStr = nltk.word_tokenize(sent)
        return wordsInStr

    def WordCheck(self, words):  # spell check
        d = enchant.Dict("en_US")
        checkedWords = []
        for word in words:
            if not d.check(word):
                print d.suggest(word)
                word = raw_input()
            checkedWords.append(word)
        return checkedWords

    def CleanLines(self, line):
        identify = string.maketrans('', '')
        delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
        cleanLine = line.translate(identify, delEStr)  # strip ASCII punctuation and digits
        return cleanLine

    def CleanWords(self, wordsInStr):  # drop stopwords and words shorter than 3 chars, lowercase
        cleanWords = []
        stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
        for words in wordsInStr:
            cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
        return cleanWords

    def StemWords(self, cleanWordsList):
        stemWords = []
        # porter = nltk.PorterStemmer()  # reportedly gives poor results, not very professional
        # result = [porter.stem(t) for t in cleanTokens]
        for words in cleanWordsList:
            stemWords += [[wn.morphy(w) for w in words]]
        return stemWords

    def WordsToStr(self, stemWords):
        strLine = []
        for words in stemWords:
            strLine += [w for w in words]
        return strLine

    def mkdir(self, path):
        path = path.strip()       # strip leading/trailing whitespace
        path = path.rstrip("\\")  # strip trailing backslash
        isExists = os.path.exists(path)  # check whether the path already exists
        if not isExists:
            print path + ' created successfully'
            os.makedirs(path)
            return True
        else:
            # do not create the directory if it already exists
            print path + ' directory already exists'
            return False

    def EnPreMain(self, dir):
        for root, dirs, files in os.walk(dir):
            for eachfiles in files:
                croupPath = os.path.join(root, eachfiles)
                print croupPath
                resultPath = conf.PreConfig.NLTKRESULTPATH + croupPath.split('/')[-2] + '/' + croupPath.split('/')[-1]
                raw = self.FileRead(croupPath).strip()
                sents = self.SenToken(raw)
                # taggedLine = self.POSTagger(sents)  # POS tagging disabled for now
                cleanLines = [self.CleanLines(line) for line in sents]
                words = [self.WordTokener(cl) for cl in cleanLines]
                # checkedWords = self.WordCheck(words)  # spell checking disabled for now
                cleanWords = self.CleanWords(words)
                stemWords = self.StemWords(cleanWords)
                # cleanWords = self.CleanWords(stemWords)  # the second cleanup pass has issues, disabled for now
                strLine = self.WordsToStr(stemWords)
                self.WriteResult(strLine, resultPath)  # for now each file is saved as a single line

    def StandardTokener(self, raw):
        result = ''  # not finished yet
        return result

enPre = EnPreprocess()
enPre.EnPreMain(conf.PreConfig.ENCORUPPATH)
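One practical note: the pipeline relies on a few NLTK data packages being installed. A one-time download sketch (package names as used by current NLTK releases):

import nltk

nltk.download('punkt')                       # sentence tokenizer loaded in SenToken
nltk.download('wordnet')                     # lemmas used by wn.morphy in StemWords
nltk.download('averaged_perceptron_tagger')  # tagger behind nltk.pos_tag, if POSTagger is enabled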
PS: I still haven't managed to get the Stanford toolkit working properly; if anyone has used it, please show me how.
