NLP Day 3

### 1

import nltk
print(nltk.corpus.brown.raw()[:100])
nltk.corpus.brown.tagged_words()[:10]
nltk.corpus.gutenberg.tagged_words()  # ==> no such method
nltk.corpus.brown.tagged_words(tagset='universal')[:10]

### 2. Exercise

# How often do different parts of speech appear?
# Use brown corpus.

fd = nltk.FreqDist([
    word_and_tag[1]
    for word_and_tag
    in nltk.corpus.brown.tagged_words(tagset='universal')
])
fd.most_common()

### 3. Exercise

# How often do different parts of speech appear in different categories?
# Use brown corpus.

cfd = nltk.ConditionalFreqDist(
    (category, tag)
    for category in nltk.corpus.brown.categories()
    for (word, tag) in nltk.corpus.brown.tagged_words(
        categories=category, tagset='universal')
)
cfd.tabulate()
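
Tabulating all fifteen categories against all twelve tags prints a very wide table. As a small sketch (the category and tag names below are just an arbitrary pick), tabulate also accepts conditions and samples to restrict the output:

# Sketch: restrict the table to a few categories and tags
# (these particular names are an arbitrary choice).
cfd.tabulate(conditions=['news', 'romance', 'science_fiction'],
             samples=['NOUN', 'VERB', 'ADJ', 'ADV'])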

### 4. Exercise

# Find words which are highly ambiguous as to their part-of-speech tag,
# that is, words that appear as at least four different parts of speech
# in different contexts. Use brown corpus.

# For example, close can be:

# - an adjective: Only the families and a dozen close friends will be present.
# - an adverb: Both came close to playing football at the University of Oklahoma.
# - a verb: Position your components before you close them in.
# - a noun: One night, at the close of the evening service, he came forward.

cfd = nltk.ConditionalFreqDist([
    (word.lower(), tag)
    for (word, tag) 
    in nltk.corpus.brown.tagged_words(tagset='universal')
])

for word in sorted(cfd.conditions()):
    if len(cfd[word]) > 3:
        tags = [tag for (tag, _) in cfd[word].most_common()]
        print('{:20} {}'.format(word, ' '.join(tags)))

### 5

# ADJ	adjective	new, good, high, special, big, local
# ADP	adposition	on, of, at, with, by, into, under
# ADV	adverb	really, already, still, early, now
# CONJ	conjunction	and, or, but, if, while, although
# DET	determiner or article	the, a, some, most, every, no, which
# NOUN	noun	year, home, costs, time, Africa
# NUM	numeral	twenty-four, fourth, 1991, 14:24
# PRT	particle	at, on, out, over, per, that, up, with
# PRON	pronoun	he, their, her, its, my, I, us
# VERB	verb	is, say, told, given, playing, would
# .	punctuation marks	. , ; !
# X	other	ersatz, esprit, dunno, gr8, univeristy
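
As a quick cross-check against the table above, the tagged Brown corpus can be queried for sample words carrying a given universal tag (a small sketch; the choice of NUM is arbitrary):

# Sketch: a few distinct words observed with the NUM tag in Brown.
tagged = nltk.corpus.brown.tagged_words(tagset='universal')
sorted({word.lower() for (word, tag) in tagged if tag == 'NUM'})[:10]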

### 6

tagged_words = nltk.corpus.brown.tagged_words(
    categories='news', tagset='universal')
tags = [tag for (word, tag) in tagged_words]
fd = nltk.FreqDist(tags)
fd.most_common()

default_tagger = nltk.DefaultTagger('NOUN')

brown_tagged_sents = nltk.corpus.brown.tagged_sents(
    categories='news', tagset='universal')
default_tagger.evaluate(brown_tagged_sents)
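
For illustration, the default tagger simply labels every token as NOUN, whatever the input (a sketch; the sentence is made up):

# Sketch: every token gets the NOUN tag.
default_tagger.tag(nltk.word_tokenize('The quick brown fox jumps over the lazy dog'))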

### 7

patterns = [
    ('.*ing$', 'VERB'),
    ('.*ed$', 'VERB'),
    ('.*es$', 'VERB'),  # goes
    ('.*ould$', 'VERB'),  # should
    (".*'s$", 'NOUN'),
    ('.*s$', 'NOUN'),
    (r'^-?\d+(\.\d+)?$', 'NUM'),
    ('.*', 'NOUN'),
]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.evaluate(brown_tagged_sents)
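
Applying the tagger directly to a token list shows which pattern fires for each word (a sketch; the sentence is made up):

# Sketch: 'playing' and 'barked' match the verb suffixes, 'cats' the
# plural-noun rule, '100' and '1' the numeral rule, the rest fall
# through to the catch-all NOUN rule.
regexp_tagger.tag(nltk.word_tokenize('100 cats were playing and 1 dog barked'))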

### 8

fd = nltk.FreqDist(nltk.corpus.brown.words(categories='news'))
most_freq_words = fd.most_common(100)
most_freq_words

cfd = nltk.ConditionalFreqDist(
    nltk.corpus.brown.tagged_words(
        categories='news', tagset='universal'))
likely_tags = dict([(word, cfd[word].max())
                    for (word, _) in most_freq_words])
list(likely_tags.items())[:10]

unigram_tagger = nltk.UnigramTagger(model=likely_tags)
unigram_tagger.evaluate(brown_tagged_sents)

### 9 (evaluate first 10 elements from a generator)

import itertools

generator = (2*x for x in range(100000))

list(itertools.takewhile(
    lambda pair: pair[0] < 10,   # pair is (index, value) from enumerate
    enumerate(generator)
))
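
An alternative sketch: itertools.islice stops after the requested number of items and avoids the index bookkeeping altogether:

# Recreate the generator (takewhile above has already consumed part of it).
generator = (2*x for x in range(100000))
list(itertools.islice(generator, 10))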

### 10

unigram_with_backoff = nltk.UnigramTagger(
    model=likely_tags,
    backoff=default_tagger,
)
unigram_with_backoff.evaluate(brown_tagged_sents)
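
To see the backoff in action, tag a short token list; words missing from the 100-word model fall through to the NOUN default tagger (a sketch; which words are covered depends on the frequency list):

# Sketch: 'hello' and 'procrastination' are unlikely to be among the 100
# most frequent news words, so they fall back to NOUN.
unigram_with_backoff.tag(['The', 'said', 'hello', 'procrastination'])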

### 11

fd = nltk.FreqDist(nltk.corpus.brown.words(categories='news'))
most_freq_words = fd.most_common(10000)

cfd = nltk.ConditionalFreqDist(
    nltk.corpus.brown.tagged_words(
        categories='news', tagset='universal'))
likely_tags = dict([(word, cfd[word].max())
                    for (word, _) in most_freq_words])

unigram_with_backoff_3 = nltk.UnigramTagger(
    model=likely_tags,
    backoff=default_tagger,
)
unigram_with_backoff_3.evaluate(brown_tagged_sents)

##### MACHINE LEARNING ----------------------

### 12

# 1. Change format of input
sents = nltk.corpus.treebank_raw.sents()
tokens2 = nltk.corpus.treebank_raw.words()  # flat token list (not used below)
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

# 2. Extract features
def punct_features(tokens, i):
    return {
        'next-word-capitalized': tokens[i+1][0].isupper(),
        'prev-word': tokens[i-1].lower(),
        'punct': tokens[i],
        'prev-word-is-one-char': len(tokens[i-1]) == 1,
    }

featuresets = [
    (punct_features(tokens, i), i in boundaries)
    for i in range(1, len(tokens)-1)
    if tokens[i] in '.?!'
]
featuresets[:2]

# 3. Train classifier
classifier = nltk.NaiveBayesClassifier.train(featuresets)

# 4. Use classifier on a new input
test_tokens = nltk.word_tokenize(
    'This is an example sentence'
    ', mr . Smith. And this is another one.'
)

# 4a. Extract features from the new input
test_featuresets = [
    punct_features(test_tokens, i)
    for i in range(1, len(test_tokens)-1)
    if test_tokens[i] in '.?!'
]

# 4b. Use classifier
classifier.classify_many(test_featuresets)
classifier.show_most_informative_features()

# 5. Evaluate classifier
# Note: this measures accuracy on the same data the classifier was trained
# on, so the score is optimistic; a proper train/devtest/test split follows
# in the next section.
nltk.classify.accuracy(classifier, featuresets)

### 13

# Split input data into three sets
index1 = int(len(featuresets) * 0.6)
index2 = int(len(featuresets) * 0.8)
train_set = featuresets[:index1]
devtest_set = featuresets[index1:index2]
test_set = featuresets[index2:]
len(train_set), len(devtest_set), len(test_set)
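
With the split in place, the punctuation classifier from the previous section can be retrained on train_set only and scored on the held-out devtest_set (a sketch continuing from the code above):

# Sketch: train on train_set, evaluate on data the classifier has not seen.
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)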

### 14. Exercise

# Male and female names have distinctive characteristics. 
# For example, if a name ends with a, e or i,
# it’s likely to be female.
# Build a classifier that looks at the first
# and last letter in a name.
# Evaluate its accuracy.
# Use names corpus.

# Step 1: Change format of input -- Build labeled_names list.
# labeled_names == [('John', 'male'), ('Alice', 'female')]

import random

male_names = nltk.corpus.names.words('male.txt')
female_names = nltk.corpus.names.words('female.txt')
labelled_male_names = [(name, 'male') 
                      for name in male_names]
labelled_female_names = [(name, 'female') 
                        for name in female_names]
labelled_names = labelled_male_names + labelled_female_names

random.shuffle(labelled_names)

# Step 2a: Build feature extractor
# >>> gender_features('Shrek')
# {'first_letter': 'S', 'last_letter': 'k'}

def gender_features(name):
    return {
        'first_letter': name[0].lower(),
        'last_letter': name[-1].lower(),
    }

gender_features('Shrek')

# Step 2b: Extract the features
# featuresets == [({'first_letter': 'S', 'last_letter': 'k'}, 'male')]

featuresets = [
    (gender_features(name), gender)
    for name, gender in labelled_names
]
featuresets[:2]

# Step 2c: Split the input into three sets
index1 = int(len(featuresets) * 0.6)
index2 = int(len(featuresets) * 0.8)
train_set = featuresets[:index1]
devtest_set = featuresets[index1:index2]
test_set = featuresets[index2:]
len(train_set), len(devtest_set), len(test_set)

# Step 3: train the classifier
bayes_classifier = nltk.NaiveBayesClassifier.train(train_set)

# Step 4: test the classifier on example names: 'Neo', 'Maria'
featureset = gender_features('Neo')
bayes_classifier.classify(featureset)

# Step 5: evaluate the classifier on devtest_set
nltk.classify.accuracy(bayes_classifier, devtest_set)
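
A rough error analysis can also help: list the devtest names the classifier gets wrong (a sketch; it assumes labelled_names and featuresets are still in the same shuffled order, as built above):

# Sketch: names in the devtest slice whose predicted gender is wrong.
errors = [
    (gender, name)
    for (name, gender) in labelled_names[index1:index2]
    if bayes_classifier.classify(gender_features(name)) != gender
]
errors[:10]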

# Step 6: use DecisionTreeClassifier - train it, test it
# on example names and evaluate it

tree_classifier = nltk.DecisionTreeClassifier.train(train_set)

tree_classifier.classify(gender_features('Maria'))

nltk.classify.accuracy(tree_classifier, devtest_set)

nltk.classify.accuracy(tree_classifier, test_set)
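
The learned tree can also be inspected in a readable form via the classifier's pseudocode method (a sketch; the depth value is an arbitrary choice):

# Sketch: print the top of the learned decision tree as pseudocode.
print(tree_classifier.pseudocode(depth=4))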

####### CHUNKING ------------------------------

### 15

def preprocess(document):
    sentences = nltk.sent_tokenize(document)      # split text into sentences
    sentences = [nltk.word_tokenize(sent)         # split sentences into words
                 for sent in sentences]
    sentences = [nltk.pos_tag(sent, tagset='universal')  # tag each word
                 for sent in sentences]
    return sentences

sentences = preprocess(
    'The little yellow dog barked at the cat.')
sentences

grammar = "NP: {<DET>?<ADJ>*<NOUN>}"
cp = nltk.RegexpParser(grammar)
t = cp.parse(sentences[0])
t  # display the tree in jupyter notebook

t.draw()  # display the tree in a new window

print(t.pformat())  # display the tree in textual format
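
The chunks can also be pulled out programmatically, for example as plain word lists (a sketch):

# Sketch: collect the words of every NP subtree of the parse.
[[word for (word, tag) in subtree.leaves()]
 for subtree in t.subtrees()
 if subtree.label() == 'NP']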

### 16

sentences = preprocess(
    'The market for system-management software for '
    'Digital\'s hardware is fragmented enough that '
    'a giant such as Computer Associates '
    'should do well there.')
result = cp.parse(sentences[0])
result

print(result.pformat())

for subtree in result.subtrees():
    print(subtree.label(), subtree)

### 17. Exercise

# Find all sequences of VERB PRT VERB,
# where PRT stands for any particle (e.g. to, in).
# Use gutenberg corpus.
#
# Expected output:
# (CHUNK seemed/VERB to/PRT unite/VERB)
# (CHUNK ceased/VERB to/PRT hold/VERB)
# (CHUNK left/VERB to/PRT dine/VERB)

sentences = preprocess(nltk.corpus.gutenberg.raw()[:10000])

grammar = 'CHUNK: {<VERB> <PRT> <VERB>}'
cp = nltk.RegexpParser(grammar)
t = cp.parse(sentences[0])
print(t.pformat())

# Solution:

sentences = preprocess(nltk.corpus.gutenberg.raw()[:10000])
grammar = 'CHUNK: {<VERB> <PRT> <VERB>}'
cp = nltk.RegexpParser(grammar)

for sent in sentences:
    t = cp.parse(sent)
    for subtree in t.subtrees():
        if subtree.label() == 'CHUNK':
            print(subtree)
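
Equivalently, subtrees accepts a filter function, which avoids the explicit label check (a sketch):

# Sketch: let subtrees() do the CHUNK filtering.
for sent in sentences:
    for subtree in cp.parse(sent).subtrees(
            filter=lambda st: st.label() == 'CHUNK'):
        print(subtree)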