# NLP day 3 — course notes (wiki export; navigation links removed)
### 1
import nltk
# Peek at the first 100 characters of the raw (inline-tagged) Brown corpus.
print(nltk.corpus.brown.raw()[:100])
# First ten (word, tag) pairs using Brown's native tagset.
nltk.corpus.brown.tagged_words()[:10]
nltk.corpus.gutenberg.tagged_words() # ==> no such method (gutenberg is an untagged corpus; raises AttributeError)
# Same words mapped onto the simplified universal tagset.
nltk.corpus.brown.tagged_words(tagset='universal')[:10]
### 2. Exercise
# How often do different parts of speech appear?
# Use brown corpus.
# How frequent is each part of speech across the whole Brown corpus?
fd = nltk.FreqDist(
    tag
    for (_word, tag) in nltk.corpus.brown.tagged_words(tagset='universal')
)
fd.most_common()
### 3. Exercise
# How often do different parts of speech appear in different categories?
# Use brown corpus.
# Tag frequencies broken down by Brown category: one condition per
# category, counting universal tags within it.
cfd = nltk.ConditionalFreqDist(
    (cat, pos)
    for cat in nltk.corpus.brown.categories()
    for (_word, pos) in nltk.corpus.brown.tagged_words(
        categories=cat, tagset='universal')
)
cfd.tabulate()
### 4. Exercise
# Find out words which are highly ambiguous as to their part of speech tag. That is, find words that are at least four different parts of speech in different contexts. Use brown corpus.
# For example, close can be:
# - an adjective: Only the families and a dozen close friends will be present.
# - an adverb: Both came close to playing football at the University of Oklahoma.
# - a verb: Position your components before you close them in.
# - a noun: One night, at the close of the evening service, he came forward.
# Condition on the lower-cased word; count which universal tags it takes.
cfd = nltk.ConditionalFreqDist(
    (token.lower(), pos)
    for (token, pos) in nltk.corpus.brown.tagged_words(tagset='universal')
)
# Report words seen with at least four distinct tags, most frequent tag first.
for word in sorted(cfd.conditions()):
    tag_dist = cfd[word]
    if len(tag_dist) > 3:
        tag_names = ' '.join(t for (t, _) in tag_dist.most_common())
        print('{:20} {}'.format(word, tag_names))
### 5
# ADJ adjective new, good, high, special, big, local
# ADP adposition on, of, at, with, by, into, under
# ADV adverb really, already, still, early, now
# CONJ conjunction and, or, but, if, while, although
# DET determiner or article the, a, some, most, every, no, which
# NOUN noun year, home, costs, time, Africa
# NUM numeral twenty-four, fourth, 1991, 14:24
# PRT particle at, on, out, over per, that, up, with
# PRON pronoun he, their, her, its, my, I, us
# VERB verb is, say, told, given, playing, would
# . punctuation marks . , ; !
# X other ersatz, esprit, dunno, gr8, univeristy
### 6
# Most common universal tag in the news category (it is NOUN).
tagged_words = nltk.corpus.brown.tagged_words(
    categories='news', tagset='universal')
tags = [tag for (word, tag) in tagged_words]
fd = nltk.FreqDist(tags)
fd.most_common()
# Baseline tagger: label every token with the single most frequent tag.
default_tagger = nltk.DefaultTagger('NOUN')
brown_tagged_sents = nltk.corpus.brown.tagged_sents(
    categories='news', tagset='universal')
# NOTE(review): TaggerI.evaluate() was renamed to .accuracy() in newer
# NLTK releases — confirm against the installed version.
default_tagger.evaluate(brown_tagged_sents)
### 7
# Suffix-based tagging rules, tried in order; the final '.*' is a
# catch-all default. Raw strings avoid the invalid escape sequence
# '\d' (a SyntaxWarning, and eventually an error, in modern Python).
patterns = [
    (r'.*ing$', 'VERB'),           # gerunds, e.g. running
    (r'.*ed$', 'VERB'),            # simple past, e.g. walked
    (r'.*es$', 'VERB'),            # 3rd person singular, e.g. goes
    (r'.*ould$', 'VERB'),          # modals, e.g. should, would
    (r".*'s$", 'NOUN'),            # possessive nouns
    (r'.*s$', 'NOUN'),             # plural nouns
    (r'^-?\d+(\.\d+)?$', 'NUM'),   # integers and decimals
    (r'.*', 'NOUN'),               # default: assume noun
]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.evaluate(brown_tagged_sents)
### 8
# Lookup tagger: learn the single most likely tag for each of the 100
# most frequent news words.
fd = nltk.FreqDist(nltk.corpus.brown.words(categories='news'))
most_freq_words = fd.most_common(100)
most_freq_words
cfd = nltk.ConditionalFreqDist(
    nltk.corpus.brown.tagged_words(
        categories='news', tagset='universal'))
# Map each frequent word to its most likely tag (dict comprehension
# instead of dict([...])).
likely_tags = {word: cfd[word].max()
               for (word, _) in most_freq_words}
# dict views are not subscriptable in Python 3; materialize before slicing.
list(likely_tags.items())[:10]
unigram_tagger = nltk.UnigramTagger(model=likely_tags)
unigram_tagger.evaluate(brown_tagged_sents)
### 9 (evaluate first 10 elements from a generator)
import itertools
generator = (2*x for x in xrange(100000))
list(itertools.takewhile(
lambda (i, x): i < 10,
enumerate(generator)
))
### 10
# Lookup tagger that falls back to the NOUN default tagger for words
# outside the learned model, instead of returning None.
unigram_with_backoff = nltk.UnigramTagger(
    model=likely_tags,
    backoff=default_tagger,
)
unigram_with_backoff.evaluate(brown_tagged_sents)
### 11
# Same lookup-tagger-with-backoff idea, scaled up to a 10,000-word model.
fd = nltk.FreqDist(nltk.corpus.brown.words(categories='news'))
most_freq_words = fd.most_common(10000)
cfd = nltk.ConditionalFreqDist(
    nltk.corpus.brown.tagged_words(
        categories='news', tagset='universal'))
# Dict comprehension instead of dict([...]) (flake8-comprehensions C404).
likely_tags = {word: cfd[word].max()
               for (word, _) in most_freq_words}
unigram_with_backoff_3 = nltk.UnigramTagger(
    model=likely_tags,
    backoff=default_tagger,
)
unigram_with_backoff_3.evaluate(brown_tagged_sents)
##### MACHINE LEARNING ----------------------
### 12
# 1. Change format of input
sents = nltk.corpus.treebank_raw.sents()
tokens2 = nltk.corpus.treebank_raw.words()
# Flatten the sentences into one token stream, and record the index of
# each sentence's final token as a sentence boundary.
tokens = [token for sent in sents for token in sent]
sentence_ends = itertools.accumulate(len(sent) for sent in sents)
boundaries = {end - 1 for end in sentence_ends}
# 2. Extract features
def punct_features(tokens, i):
    """Feature dict for deciding whether the punctuation token at
    position *i* of *tokens* ends a sentence."""
    prev_tok = tokens[i - 1]
    next_tok = tokens[i + 1]
    return {
        'next-word-capitalized': next_tok[0].isupper(),
        'prev-word': prev_tok.lower(),
        'punct': tokens[i],
        'prev-word-is-one-char': len(prev_tok) == 1,
    }
# One (features, is-boundary) pair per candidate punctuation token;
# the label is whether the token's index is a known sentence boundary.
featuresets = [
    (punct_features(tokens, i), i in boundaries)
    for i in range(1, len(tokens)-1)
    if tokens[i] in '.?!'
]
featuresets[:2]
# 3. Train classifier
classifier = nltk.NaiveBayesClassifier.train(featuresets)
# 4. Use classifier on a new input
test_tokens = nltk.word_tokenize(
'This is an example sentence'
', mr . Smith. And this is another one.'
)
# 4a. Extract features from the new input
test_featuresets = [
punct_features(test_tokens, i)
for i in range(1, len(test_tokens)-1)
if test_tokens[i] in '.?!'
]
# 4b. Use classifier
classifier.classify_many(test_featuresets)
classifier.show_most_informative_features()
# 5. Evaluate classifier
nltk.classify.accuracy(classifier, featuresets)
### 13
# Split input data into three sets
# 60% train / 20% dev-test / 20% test.
cut_60 = int(len(featuresets) * 0.6)
cut_80 = int(len(featuresets) * 0.8)
train_set = featuresets[:cut_60]
devtest_set = featuresets[cut_60:cut_80]
test_set = featuresets[cut_80:]
len(train_set), len(devtest_set), len(test_set)
### 14. Exercise
# Male and female names have distinctive characteristics.
# For example, if a name ends with a, e or i,
# it’s likely to be female.
# Build a classifier that looks at the first
# and last letter in a name.
# Evaluate its accuracy.
# Use names corpus.
# Step 1: Change format of input -- Build labeled_names list.
# labeled_names == [('John', 'male'), ('Alice', 'female')]
import random
# Pair every name with its gender label, pool both lists, and shuffle
# so the later train/dev/test split is not ordered by gender.
male_names = nltk.corpus.names.words('male.txt')
female_names = nltk.corpus.names.words('female.txt')
labelled_male_names = [(name, 'male') for name in male_names]
labelled_female_names = [(name, 'female') for name in female_names]
labelled_names = labelled_male_names + labelled_female_names
random.shuffle(labelled_names)
# Step 2a: Build feature extractor
# >>> gender_features('Shrek')
# {'first_letter': 's', 'last_letter': 'k'}
def gender_features(name):
    """Return the lower-cased first and last letter of *name* as features."""
    first, last = name[0], name[-1]
    return {
        'first_letter': first.lower(),
        'last_letter': last.lower(),
    }
gender_features('Shrek')
# Step 2b: Extract the features
# featuresets == [({'first_letter': 's', 'last_letter': 'k'}, 'male')]
# Extract (features, label) pairs for every labelled name.
featuresets = [(gender_features(name), gender)
               for (name, gender) in labelled_names]
featuresets[:2]
# Step 2c: Split the input into three sets
# Step 2c: 60% train / 20% dev-test / 20% test.
cut_60 = int(len(featuresets) * 0.6)
cut_80 = int(len(featuresets) * 0.8)
train_set = featuresets[:cut_60]
devtest_set = featuresets[cut_60:cut_80]
test_set = featuresets[cut_80:]
len(train_set), len(devtest_set), len(test_set)
# Step 3: train the classifier
# Step 3: train the classifier
bayes_classifier = nltk.NaiveBayesClassifier.train(train_set)
# Step 4: test the classifier on example names: 'Neo', 'Maria'
featureset = gender_features('Neo')
bayes_classifier.classify(featureset)
# Step 5: evaluate the classifier on devtest_set
nltk.classify.accuracy(bayes_classifier, devtest_set)
# Step 6: use DecisionTreeClassifier - train it, test it
# on example names and evaluate it
# NOTE(review): decision-tree training is noticeably slower than
# Naive Bayes on this feature set.
tree_classifier = nltk.DecisionTreeClassifier.train(train_set)
tree_classifier.classify(gender_features('Maria'))
nltk.classify.accuracy(tree_classifier, devtest_set)
nltk.classify.accuracy(tree_classifier, test_set)
####### CHUNKING ------------------------------
### 15
def preprocess(document):
    """Split *document* into sentences, tokenize each sentence, and
    POS-tag the tokens with the universal tagset.

    Returns a list of sentences, each a list of (word, tag) pairs.
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')
        for sentence in nltk.sent_tokenize(document)
    ]
sentences = preprocess(
    'The little yellow dog barked at the cat.')
sentences
# NP chunk = optional determiner, any number of adjectives, then a noun.
grammar = "NP: {<DET>?<ADJ>*<NOUN>}"
cp = nltk.RegexpParser(grammar)
t = cp.parse(sentences[0])
t # display the tree in jupyter notebook
t.draw() # display the tree in a new window
print(t.pformat()) # display the tree in textual format
### 16
sentences = preprocess(
    'The market for system-management software for '
    'Digital\'s hardware is fragmented enough that '
    'a giant such as Computer Associates '
    'should do well there.')
result = cp.parse(sentences[0])
result
print(result.pformat())
# Walk every subtree (including the root) and show its label plus contents.
# (Fixed: Python 2 `print x, y` statement is a SyntaxError in Python 3.)
for subtree in result.subtrees():
    print(subtree.label(), subtree)
### 17. Exercise
# First, find all sequences of VERB PRT VERB
# where PRT stands for any particle (to, in).
# Use gutenberg corpus
#
# Expected output:
# (CHUNK seemed/VERB to/PRT unite/VERB)
# (CHUNK ceased/VERB to/PRT hold/VERB)
# (CHUNK left/VERB to/PRT dine/VERB)
# Starter: parse only the first sentence to see the chunker output shape.
sentences = preprocess(nltk.corpus.gutenberg.raw()[:10000])
grammar = 'CHUNK: {<VERB> <PRT> <VERB>}'
cp = nltk.RegexpParser(grammar)
t = cp.parse(sentences[0])
print(t.pformat())
# Solution:
# Solution: scan every sentence and print only the CHUNK subtrees
# (VERB PRT VERB sequences, e.g. "seemed to unite").
# (Fixed: Python 2 `print subtree` statement is a SyntaxError in Python 3.)
sentences = preprocess(nltk.corpus.gutenberg.raw()[:10000])
grammar = 'CHUNK: {<VERB> <PRT> <VERB>}'
cp = nltk.RegexpParser(grammar)
for sent in sentences:
    t = cp.parse(sent)
    for subtree in t.subtrees():
        if subtree.label() == 'CHUNK':
            print(subtree)