# NLP — Day 2
# (From training material; wiki navigation links removed.)
### 1

import re

import nltk
from nltk.corpus import wordnet as wn

# Explore WordNet: synsets, lemma names and hyponyms.
wn.synsets('word')                    # every synset containing the lemma 'word'
car = wn.synset('car.n.01')           # first noun sense of 'car'
car.lemma_names()                     # synonyms grouped in this synset
car.hyponyms()                        # more specific concepts (kinds of car)
wn.all_synsets('n')                   # lazy iterator over all noun synsets
sum(1 for n in wn.all_synsets('n'))   # total number of noun synsets

# What percentage of noun synsets have no hyponyms?
leaves = sum(1 for s in wn.all_synsets('n') if not s.hyponyms())
total = sum(1 for s in wn.all_synsets('n'))
100.0 * leaves / total

### 2 

# What is the branching factor of noun hyponyms?
# That is, how many hyponyms on average has each noun synset?

# Count the direct hyponyms of every noun synset, then average.
hyponyms = [len(s.hyponyms()) for s in wn.all_synsets('n')]
sum(hyponyms) / float(len(hyponyms))

### 3

# Compare two whale synsets and locate their common ancestors in the
# WordNet hypernym hierarchy.

right_whale = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
# Deepest ancestor synset(s) shared by both whales.
right_whale.lowest_common_hypernyms(orca)

# First (most salient) direct hypernym of right_whale.
baleen_whale = right_whale.hypernyms()[0]

baleen_whale.hyponyms()

# Length of the shortest hypernym path from this synset to the root.
right_whale.min_depth()

# A whale and a fruit share only a very general ancestor.
fruit = wn.synset('fruit.n.01')
whole = right_whale.lowest_common_hypernyms(fruit)[0]
whole

whole.min_depth()

# path_similarity = 1 / (shortest path length + 1):
# unrelated concepts score near 0, identical synsets score 1.0.
right_whale.path_similarity(fruit)

right_whale.path_similarity(orca)

right_whale.path_similarity(right_whale)

### 4

# Print a pairwise path-similarity matrix (in percent) over every noun
# synset of 'food', 'fruit' and 'car'.
words = ['food', 'fruit', 'car']
synsets = []
for word in words:
    synsets.extend(wn.synsets(word, 'n'))

for left in synsets:
    scores = (left.path_similarity(right) * 100 for right in synsets)
    row = ' '.join('{:3.0f}'.format(score) for score in scores)
    print('{:20} {}'.format(left.name(), row))

### 5

# Python 3: urlopen lives in urllib.request (was `from urllib import urlopen`).
from urllib.request import urlopen

url = 'http://www.gutenberg.org/files/1112/1112.txt'
# .read() returns bytes in Python 3; decode before tokenizing.
raw = urlopen(url).read().decode('utf-8')

words = nltk.word_tokenize(raw)
words[:20]

text = nltk.Text(words)

# Show the 50 strongest collocations (frequent bigrams) in the text.
text.collocations(50)

### 6

# Basic string operations — converted from Python 2 `print x` statements
# to Python 3 `print(x)` calls, consistent with the rest of this file.
text = 'string string string'
print(text)
print(text[:10])            # slice: first ten characters
print(text[5:10])
print(text[5])
print(text[-5])             # negative index counts from the end
print(text.find('ing'))     # index of first match
print(text.find('sales'))   # -1 when not found
print('sales' in text)
print(text.replace('str', 'text'))
tokens = text.split(' ')
print(tokens)
print('-'.join(tokens))
print('\n'.join(tokens))
print(text.endswith('ing'))
print(text.startswith('ing'))

### 7

# Suppose we have room in a crossword puzzle 
# for an 8-letter word with j as its third letter 
# and t as its sixth letter. Find matching words.

# Build the word list used by the crossword/T9 exercises: the lowercase
# entries of the NLTK words corpus (was used below without being defined).
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

# ^..j..t..$ anchors exactly eight letters with 'j' third and 't' sixth.
[w for w in wordlist if re.search('^..j..t..$', w)][:10]

### 8

# The T9 system is used for entering text on mobile phones.
# Two or more words that are entered with the same sequence
# of keystrokes are known as textonyms.
# For example, both hole and golf are entered by pressing
# the sequence 4653. What other words could be produced 
# with the same sequence?

# 4 = ghi
# 6 = mno
# 5 = jkl
# 3 = def

# One character class per keypress: a four-letter word whose letters come
# from keys 4, 6, 5, 3 respectively (so it matches 'hole' and 'golf').
# NOTE(review): `wordlist` is not defined in this file — presumably the
# lowercase NLTK words corpus from the course material; verify.
[w for w in wordlist 
 if re.search('^[ghi][mno][jkl][def]$', w)][:10]

### 9

wsj = sorted(set(nltk.corpus.treebank.words()))

# 1. Find all words that contain at least one digit.
[w for w in wsj if re.search(r'\d', w)]
# 2. Find four-digit numbers.
[w for w in wsj if re.search(r'^\d{4}$', w)]
# 3. Find all integer numbers that have four or more digits.
[w for w in wsj if re.search(r'^\d{4,}$', w)]
# 4. Find decimal numbers.
[w for w in wsj if re.search(r'^\d+\.\d+$', w)]
# 5. Find words ending with ed or ing.
[w for w in wsj if re.search(r'(ed|ing)$', w)]
# 6. Find words like black-and-white, father-in-law, 
#    A-in-B, etc.
[w for w in wsj if re.search(r'^\w+-\w+-\w+$', w)]

### 10

# What are the most common sequences of two or more vowels in 
# English language?

# r'[aeiou]{2,}' matches every maximal run of two or more vowels
# (was a `???` placeholder).
fd = nltk.FreqDist(vs
                   for word in wsj
                   for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common()

# Expected output:
# [(u'io', 549),
#  (u'ea', 476),
#  (u'ie', 331),
#  (u'ou', 329),
#  (u'ai', 261),
#  (u'ia', 253),
#  ...]

### 11

# `{2}` yields exactly-two-character matches like 'ea'.  A two-character
# string unpacks to a (condition, sample) pair inside ConditionalFreqDist:
# first vowel -> condition (row), second vowel -> sample (column), so
# tabulate() shows which vowel tends to follow which.
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.ConditionalFreqDist(
    vs
    for word in wsj
    for vs in re.findall('[aeiou]{2}', 
                         word.lower()))
fd.tabulate()

### 12

# walking -> walk
# walked -> walk
# lying -> ly

# -ing, -ed, -ly, -ious, -ies, -ive, -es, -s, -ment

def stem(word):
    """Strip one common English suffix from *word* and return the stem.

    The non-greedy ``(.*?)`` stem group plus an *optional* suffix group
    means a word with no listed suffix is returned unchanged ('go' -> 'go'),
    while the shortest stem wins ('lying' -> 'ly' + 'ing').
    """
    # Was a `???` placeholder; alternation order follows the NLTK book.
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

stem('walking')  # walk
stem('go')  # go
stem('lying')  # ly

### 12

# Non-greedy stem group followed by an optional (ing|ed) suffix group.
for sample in ['walking', 'walked', 'going', 'go']:
    print(re.findall('^(.*?)(ing|ed)?$', sample))

# Same idea, but the stem must be at least two characters long,
# so 'ing' is kept whole instead of being split into ''+'ing'.
for sample in ['walking', 'walked', 'ing', 'going', 'go']:
    print(re.findall('^(.{2,}?)(ing|ed)?$', sample))

### 13

# The Porter stemmer: a rule-based suffix stripper from NLTK.
porter = nltk.PorterStemmer()
for sample in ('lying', 'walking', 'walk', 'walked'):
    print(porter.stem(sample))

### 14

# Token-level regex search over Moby Dick: each <...> matches one whole
# token, so this finds three-token sequences "a <something> man" and
# prints the parenthesised middle word.
tokens = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
moby = nltk.Text(tokens)
moby.findall(r"<a> (<.*>) <man>")

### 15

# Search for patterns like "A and other Bs" in brown corpus.

# Token-level search: <\w*> is one word token, <\w*s> a plural one.
# NOTE(review): the expected output below ('Wagner and other officials')
# suggests the 'news' category — confirm against the course slides.
news_text = nltk.Text(nltk.corpus.brown.words(categories='news'))
news_text.findall(r"<\w*> <and> <other> <\w*s>")

# Expected output:
# companies and other corporations;
# Union and other members;
# Wagner and other officials; 
# grillwork and other things;