# Standard-library imports first, then third-party (PEP 8 grouping).
import os
import re
from collections import defaultdict

import nltk
from nltk.corpus import brown, state_union, treebank, words
# Lists and tuples
# words = ['I', 'turned', 'off', 'the', 'spectroroute','the']
# words2=('I', 'turned', 'off', 'the', 'spectroroute','the','I')
# print (set(words))
# #print(reversed(words))
# print(sorted(words))
# print (set(words2))
# print(reversed(words2))
# print(sorted(words2))
#NOUN 名詞
# brown_news_tagged=brown.tagged_words(categories='news',tagset='universal')
# word_tag_pairs=nltk.bigrams(brown_news_tagged)
# noun_proceders = [a[1]for(a,b)in word_tag_pairs if b[1]=='NOUN']
# fdist=nltk.FreqDist(noun_proceders)
# common_proceders=[tag for (tag,value) in fdist.most_common()]
# print(common_proceders)  # high-frequency word classes that precede nouns
#Verb 動(dòng)詞
# Find verbs whose past-tense (VBD) and past-participle (VBN) forms are identical
# wsj=treebank.tagged_words()
# cfd1=nltk.ConditionalFreqDist(wsj)
# vl=[w for w in cfd1.conditions()if 'VBN' in cfd1[w] and 'VBD' in cfd1[w]]
# print(vl)
# Find the position of a given past participle together with its tag
# cfd2=nltk.ConditionalFreqDist((tag,word)for (word,tag)in wsj)
# vbn_list=list(cfd2['VBN'])
# idx1=wsj.index(('kicked','VBN'))
# print(idx1)
# Get the word that precedes it
# for v in vbn_list:
#? ? idx=wsj.index((v, 'VBN'))
#? ? print (wsj[idx-1:idx])
# Equivalent to:
#print([wsj[wsj.index((v, 'VBN'))-1:wsj.index((v, 'VBN'))] for v in vbn_list])
#Ajectives and Adverbs 形容詞和副詞
# Inverting a dictionary is a common technique
# def findtags(tag_prefix, tagges_text):
#? ? cfd=nltk.ConditionalFreqDist((tag,word) for (word,tag) in tagges_text
#? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? if tag.startswith(tag_prefix))
#        return dict((tag, cfd[tag].most_common(5)) for tag in cfd.conditions())
#exploring tagged? corpora 探索標(biāo)注的數(shù)據(jù)庫
# brwon_learnd_tagged=brown.tagged_words(categories='learned', tagset='universal')
# tags=[b[1]for(a,b)in nltk.bigrams(brwon_learnd_tagged)if a[0]=='often']
# #print(tags)
# fd=nltk.FreqDist(tags)
# print(fd.tabulate())
# brwon_learnd_tagged=brown.tagged_words(categories='news', tagset='universal')
# cfd=nltk.ConditionalFreqDist((word.lower(),tag)
#? ? ? ? ? ? ? ? ? ? ? ? ? ? for (word,tag) in brwon_learnd_tagged)
# for word in sorted(cfd.conditions()):
#? ? if len(cfd[word])>3:
#? ? ? ? tags=[tag for (tag, _) in cfd[word].most_common()]
#? ? ? ? #print(cfd[word])
#? ? ? ? print(word, tags)
#dictionary 詞典:默認(rèn)詞典
# news_words = brown.words(categories='news')
# fd=nltk.FreqDist(news_words)
# v1000=[word for (word, _) in fd.most_common(1000)]
# mapping=defaultdict(lambda: 'UNK')
# for word in v1000:
#? ? mapping[word]=word
# new_word=[mapping[word] for word in news_words]
# print(new_word[:20])
# incrementally updating a Dictionary 詞典內(nèi)容遞增
# words = words.words('en')
# last_letters=defaultdict(list)
# for word in words:
#    key=word[-2:]  # group words by their last two letters, appending each word under its key
#? ? last_letters[key].append(word)
# print(last_letters['zy'][:10])
#
# anagrams=defaultdict(list)  # find all words composed of the same set of letters
# for word in words:
#? ? key=''.join(sorted(word))
#? ? anagrams[key].append(word)
# NLTK provides a convenient shortcut for this (nltk.Index)
# anagrams=nltk.Index((''.join(sorted(w)),w)for w in words)
# print(anagrams['abc'])
#invert a dictionary 反置詞典 便于查找
# pos={'cats':'N','name':'N','old':'ADJ','young':'ADJ','run':'V', 'sing':'V'}
# #pos2=dict((value,key)for (key,value)in pos.items())
# pos2=nltk.Index((value,key)for (key,value)in pos.items())
# print(pos2['N'])
#Automatic Tagging 自動(dòng)標(biāo)注: 用100個(gè)高頻詞匯的高頻tag做tagger
#The Lookup Tagger 查找tagger
# brown_tagged_sents=brown.tagged_sents(categories='news')
# fd=nltk.FreqDist(brown.words(categories='news'))
# cfd=nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# most_freq_words=fd.most_common(100)
# likely_tags=dict((word, cfd[word].max())for (word,_)in most_freq_words)
# baseline_tagger=nltk.UnigramTagger(model=likely_tags)
# print(cfd['news'].max())
# print(cfd['news'].tabulate())
# print(baseline_tagger.evaluate(brown_tagged_sents))
# N-Gram Tagging: train a unigram tagger on the Brown news corpus.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# Hold out the last 10% of sentences for evaluation.
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]  # split the data: first 90% for training
# print(train_sents[3])
test_sents = brown_tagged_sents[size:]   # remaining 10% for testing

unigram_tagger = nltk.UnigramTagger(train_sents)
# size() reports how many word->tag entries the trained model stores.
print(unigram_tagger.size())
# print(unigram_tagger.tag(brown_sents[3]))
#
# print(bigram_tagger.evaluate(test_sents))
#combination
# t0=nltk.DefaultTagger('NN')
# t1=nltk.UnigramTagger(train_sents, backoff=t0)
# t2=nltk.BigramTagger(train_sents, cutoff=2, backoff=t1)
#print(t2.evaluate(test_sents))
# test_tags = [tag for sent in brown.sents(categories='editorial')
#? ? ? ? ? ? ? ? ? for (word, tag) in t2.tag(sent)]
# gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
# print(nltk.ConfusionMatrix(gold_tags, test_tags))
# cfd=nltk.ConditionalFreqDist(
#? ? ? ? ? ? ? ? ? ? ? ? ? ? ((x[1],y[0]),y[1])
#? ? ? ? ? ? ? ? ? ? ? ? ? ? for sent in brown_tagged_sents
#? ? ? ? ? ? ? ? ? ? ? ? ? ? for x,y in nltk.bigrams(sent))
#
# ambigous_context=[c for c in cfd.conditions() if len(cfd[c])>1]
# print(sum(cfd[c].N()for c in ambigous_context)/cfd.N())