Nlp-day4

From Training Material
Jump to navigation Jump to search
### 1

import nltk

def preprocess(document):
    """Split *document* into sentences, tokenize each one and POS-tag it.

    Returns a list with one entry per sentence; each entry is whatever
    ``nltk.pos_tag`` produces for that sentence's word tokens when asked
    for the ``'universal'`` tagset.
    """
    tagged_sentences = []
    for raw_sentence in nltk.sent_tokenize(document):
        tokens = nltk.word_tokenize(raw_sentence)
        tagged_sentences.append(nltk.pos_tag(tokens, tagset='universal'))
    return tagged_sentences

# Chunk rule: a VERB tag, then a PRT tag, then another VERB tag.
grammar = 'CHUNK: {<VERB> <PRT> <VERB>}'
cp = nltk.RegexpParser(grammar)

# Only the first 10000 characters of the Gutenberg raw text are processed.
sentences = preprocess(nltk.corpus.gutenberg.raw()[:10000])

for sent in sentences:
    tree = cp.parse(sent)
    # Let subtrees() do the selection: only nodes labelled CHUNK are yielded.
    for subtree in tree.subtrees(filter=lambda node: node.label() == 'CHUNK'):
        print(subtree)

### 2

# Chunk rule: NOUN, then IN, then NOUN.
grammar = 'CHUNK: {<NOUN> <IN> <NOUN>}'
cp = nltk.RegexpParser(grammar)

# Only the first 100000 characters of the Gutenberg raw text are processed.
sentences = preprocess(nltk.corpus.gutenberg.raw()[:100000])

for sent in sentences:
    # Relabel the literal word 'in' as 'IN' so the <IN> slot of the chunk
    # rule can match it; every other (word, tag) pair is kept unchanged.
    retagged = []
    for word, tag in sent:
        if word == 'in':
            tag = 'IN'
        retagged.append((word, tag))

    for subtree in cp.parse(retagged).subtrees():
        if subtree.label() != 'CHUNK':
            continue
        print(subtree)

### 3

# Chinking example: the first rule chunks every run of tokens into an NP,
# the second rule (reversed braces) then removes runs of VERB/ADP tags
# from those chunks.
grammar = '''
NP: {<.*>+}
    }<VERB|ADP>+{
'''
cp = nltk.RegexpParser(grammar)

# preprocess() returns one tagged token list per sentence; the input here
# is a single sentence, so only the first entry is parsed.
sentences = preprocess('The little yellow dog barked at the cat.')
first_sentence = sentences[0]
cp.parse(first_sentence)

##### PARSING (CONTEXT FREE GRAMMARS) -------------------

### 4

# Tokenize on single spaces; each token has to match a quoted terminal
# in the grammar below.
s = 'Alice reported that I think that apples are red'.split(' ')
s

# Recursive grammar: the two "... that S" rules embed a further sentence,
# and the third rule is the only one that ends the recursion.
grammar = nltk.CFG.fromstring('''
S -> 'I' 'think' 'that' S
S -> 'Alice' 'reported' 'that' S
S -> 'apples' 'are' 'red'
''')

parser = nltk.ChartParser(grammar)

# Collect the parses in a list so they can be both printed and indexed.
trees = list(parser.parse(s))

for tree in trees:
    # Call form of print: valid on Python 3, and with a single argument it
    # prints the same text on Python 2 (the statement form `print tree`
    # is a SyntaxError on Python 3).
    print(tree)

trees[0]

### 5

# Same grammar as the previous example, applied to a sentence with one more
# level of embedding. split() with no argument splits on any whitespace,
# so the line breaks inside the literal do not produce tokens.
s = '''
Alice reported that Alice reported that
I think that apples are red
'''.split()

grammar = nltk.CFG.fromstring('''
S -> 'I' 'think' 'that' S
S -> 'Alice' 'reported' 'that' S
S -> 'apples' 'are' 'red'
''')

parser = nltk.ChartParser(grammar)
# Collect the parses in a list so they can be both printed and indexed.
trees = list(parser.parse(s))

for tree in trees:
    # Call form of print: valid on Python 3, and with a single argument it
    # prints the same text on Python 2.
    print(tree)

trees[0]

### 6

# Apples are red when they are ripe
s = '''
Apples are red when they are ripe
'''.split()

# Two simple clauses joined by 'when'. Every terminal must be enclosed in
# its own pair of quotes: the original third rule read `'are 'ripe'`
# (missing closing quote after are), which made the grammar string malformed.
grammar = nltk.CFG.fromstring('''
S -> S 'when' S
S -> 'Apples' 'are' 'red'
S -> 'they' 'are' 'ripe'
''')

parser = nltk.ChartParser(grammar)
# Collect the parses in a list so they can be both printed and indexed.
trees = list(parser.parse(s))

for tree in trees:
    # Call form of print: valid on Python 3, and with a single argument it
    # prints the same text on Python 2.
    print(tree)

trees[0]

### 7

s = '''
I shot an elephant in my pajamas
'''.split()

# Legend for the grammar symbols used below:
# S = sentence (I shot an elephant in my pajamas)
# PP = prepositional phrase (in my pajamas)
# VP = verb phrase (shot an elephant, shot an elephant in my pajamas)
# NP = noun phrase (I, an elephant, my pajamas, an elephant in my pajamas)
# Det = Determiner (an, my)
# N = nouns (elephant, pajamas)
# V = verbs (shot)
# P = preposition (in)

# A PP can attach either to a VP (VP -> VP PP) or inside an NP
# (NP -> Det N PP), so the sentence above has more than one parse.
grammar = nltk.CFG.fromstring('''
S -> NP VP
PP -> P NP
VP -> V NP | VP PP
NP -> Det N | Det N PP | 'I'
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
''')

parser = nltk.ChartParser(grammar)
trees = list(parser.parse(s))

for tree in trees:
    # Call form of print: valid on Python 3, and with a single argument it
    # prints the same text on Python 2.
    print(tree)

### 8. Exercise

# Consider the sentence "Kim arrived or Dana left and everyone cheered". Write down the parenthesized forms or syntax trees to show the relative scope of "and" and "or". Write a grammar to parse this sentence. Generate tree structures corresponding to both of these interpretations.

# Hint: start from writing a grammar to parse subsentences like:

# - Kim arrived,
# - Dana left,
# - everyone cheered.

s = 'Kim arrived or Dana left and everyone cheered'.split()

# A simple clause is N V; clauses are joined by a connective P ('or'/'and').
# S -> S P S does not fix how three clauses are grouped, so each possible
# bracketing of the sentence above comes out as a separate tree.
grammar = nltk.CFG.fromstring('''
S -> N V
S -> S P S
N -> 'Kim' | 'Dana' | 'everyone'
V -> 'arrived' | 'left' | 'cheered'
P -> 'or' | 'and'
''')

parser = nltk.ChartParser(grammar)
trees = list(parser.parse(s))

for tree in trees:
    # Call form of print: valid on Python 3, and with a single argument it
    # prints the same text on Python 2.
    print(tree)

##### FEATURE BASED GRAMMARS ------------------------------

### 9

# Develop a context free grammar to parse sentences like:

# - This dog runs.
# - These dogs run.

# Write a function parse that:

# - takes a sentence (string)
# - uses global variable grammar
# - prints all trees (or a message if the sentence is invalid)
# - returns the first one.

# Deliberately loose grammar: it has no notion of number, so any
# combination of determiner, noun and verb form is accepted.
grammar = nltk.CFG.fromstring('''
S -> P N V
P -> 'this' | 'these'
N -> 'dog' | 'dogs'
V -> 'run' | 'runs'
''')

def parse(sentence):
    """Parse *sentence* with the global ``grammar`` and print every tree.

    The sentence is split on whitespace. Returns the first parse tree, or
    prints "Invalid sentence" and returns None when there is no parse.
    Exceptions raised by the parser itself are not caught here.
    """
    parser = nltk.ChartParser(grammar)
    words = sentence.split()
    trees = list(parser.parse(words))
    for tree in trees:
        # Call form of print: valid on Python 3, same output on Python 2.
        print(tree)
    if trees:
        return trees[0]
    else:
        print("Invalid sentence")

### 10

# Deliberately loose grammar: it has no notion of number, so any
# combination of determiner, noun and verb form is accepted.
grammar = nltk.CFG.fromstring('''
S -> P N V
P -> 'this' | 'these'
N -> 'dog' | 'dogs'
V -> 'run' | 'runs'
''')

def parse(sentence):
    """Parse *sentence* with the global ``grammar`` and print every tree.

    The sentence is split on whitespace. Returns the first parse tree.
    When the parser raises ValueError, or when it finds no parse at all,
    prints "Invalid sentence" and returns None.
    """
    parser = nltk.ChartParser(grammar)
    words = sentence.split()
    try:
        trees = list(parser.parse(words))
    except ValueError:
        # NLTK's chart parser raises ValueError when the input contains a
        # word the grammar does not cover; treat that like "no parse".
        trees = []

    if not trees:
        # Call form of print: valid on Python 3, same output on Python 2.
        print("Invalid sentence")
        return None

    for tree in trees:
        print(tree)
    return trees[0]

### 11. Exercise

# Number agreement encoded by hand: each category exists in a singular
# (_sg) and a plural (_pl) variant, and S only combines matching variants.
grammar = nltk.CFG.fromstring('''
S -> P_sg N_sg V_sg
S -> P_pl N_pl V_pl
P_sg -> 'this'
P_pl -> 'these'
N_sg -> 'dog'
N_pl -> 'dogs'
V_sg -> 'runs'
V_pl -> 'run'
''')

# Try one sentence that mixes singular and plural, then two consistent ones.
for sentence in ('this dogs run', 'these dogs run', 'this dog runs'):
    parse(sentence)

### 12

# Feature-based version of the agreement grammar: the shared variable ?n
# forces P, N and V to carry the same NUM value (sg or pl).
featured_grammar = nltk.grammar.FeatureGrammar.fromstring('''
S -> P[NUM=?n] N[NUM=?n] V[NUM=?n]
P[NUM=sg] -> 'this'
P[NUM=pl] -> 'these'
N[NUM=sg] -> 'dog'
N[NUM=pl] -> 'dogs'
V[NUM=sg] -> 'runs'
V[NUM=pl] -> 'run'
''')

def featured_parse(sentence):
    """Parse *sentence* with the global ``featured_grammar``.

    The sentence is split on whitespace and every resulting tree is
    printed. Returns the first tree, or prints "Invalid sentence." and
    returns None when there is no parse.
    """
    featured_parser = nltk.parse.FeatureChartParser(featured_grammar)
    tokens = sentence.split()
    trees = list(featured_parser.parse(tokens))
    for tree in trees:
        print(tree)
    if trees:
        return trees[0]
    else:
        # Call form of print, matching print(tree) above; the statement
        # form used originally is a SyntaxError on Python 3.
        print("Invalid sentence.")

featured_parse('these dog runs')
featured_parse('this dog runs')

### 13

# Display the feature grammar used by feat0_parse below.
nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')

def feat0_parse(sentence):
    """Parse *sentence* with the ``feat0.fcfg`` book grammar.

    The parser is loaded from the grammar file on every call. The sentence
    is split on whitespace and every resulting tree is printed. Returns the
    first tree, or prints 'Invalid sentence' and returns None when there is
    no parse.
    """
    feat0_parser = nltk.load_parser('grammars/book_grammars/feat0.fcfg')
    tokens = sentence.split()
    trees = list(feat0_parser.parse(tokens))
    for tree in trees:
        # Call form of print: valid on Python 3, same output on Python 2.
        print(tree)
    if trees:
        return trees[0]
    else:
        print('Invalid sentence')

feat0_parse('this dog disappears')

### 14

# Write a grammar to parse sentences like:

# - I am happy.
# - She is happy.
# - Kim is happy.
# - You are happy.

# And reject invalid ones like:

# - She am happy.
# - Kim are happy.

# Features:
# - NUM: sg (singular) or pl (plural)
# - PER: 1 (I, we) or 2 (you) or 3 (he, she, it, they)

Courseware:

http://training-course-material.com/training/File:Courseware.zip

Open html/index.html file.