<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-GB">
	<id>https://training-course-material.com/index.php?action=history&amp;feed=atom&amp;title=Nlp</id>
	<title>Nlp - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://training-course-material.com/index.php?action=history&amp;feed=atom&amp;title=Nlp"/>
	<link rel="alternate" type="text/html" href="https://training-course-material.com/index.php?title=Nlp&amp;action=history"/>
	<updated>2026-05-13T23:42:12Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.45.1</generator>
	<entry>
		<id>https://training-course-material.com/index.php?title=Nlp&amp;diff=88847&amp;oldid=prev</id>
		<title>Pszlachta1 at 10:12, 3 June 2025</title>
		<link rel="alternate" type="text/html" href="https://training-course-material.com/index.php?title=Nlp&amp;diff=88847&amp;oldid=prev"/>
		<updated>2025-06-03T10:12:57Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table style=&quot;background-color: #fff; color: #202122;&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en-GB&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;Revision as of 10:12, 3 June 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l157&quot;&gt;Line 157:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 157:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;     ???&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;     ???&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;source&lt;/del&gt;&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;/&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;syntaxhighlight&lt;/ins&gt;&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Pszlachta1</name></author>
	</entry>
	<entry>
		<id>https://training-course-material.com/index.php?title=Nlp&amp;diff=88845&amp;oldid=prev</id>
		<title>Pszlachta1 at 10:10, 3 June 2025</title>
		<link rel="alternate" type="text/html" href="https://training-course-material.com/index.php?title=Nlp&amp;diff=88845&amp;oldid=prev"/>
		<updated>2025-06-03T10:10:38Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table style=&quot;background-color: #fff; color: #202122;&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en-GB&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;Revision as of 10:10, 3 June 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l1&quot;&gt;Line 1:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 1:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;{{Cat|NLP}}&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;{{Cat|NLP}}&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;source &lt;/del&gt;lang=&quot;python&quot;&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;syntaxhighlight &lt;/ins&gt;lang=&quot;python&quot;&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;### 1&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;### 1&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Pszlachta1</name></author>
	</entry>
	<entry>
		<id>https://training-course-material.com/index.php?title=Nlp&amp;diff=50156&amp;oldid=prev</id>
		<title>Bernard Szlachta at 14:20, 16 December 2016</title>
		<link rel="alternate" type="text/html" href="https://training-course-material.com/index.php?title=Nlp&amp;diff=50156&amp;oldid=prev"/>
		<updated>2016-12-16T14:20:01Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;{{Cat|NLP}}&lt;br /&gt;
&amp;lt;source lang=&amp;quot;python&amp;quot;&amp;gt;&lt;br /&gt;
&lt;br /&gt;
### 1&lt;br /&gt;
&lt;br /&gt;
words = (&amp;#039;The Tragedie of Macbeth by William Shakespeare . &amp;#039;&lt;br /&gt;
         &amp;#039;The Tragedie is great .&amp;#039;).split(&amp;#039; &amp;#039;)&lt;br /&gt;
trigrams = nltk.trigrams(words)&lt;br /&gt;
mapper = {}&lt;br /&gt;
for a, b, c in trigrams:&lt;br /&gt;
    key = (a, b)&lt;br /&gt;
    if key not in mapper:&lt;br /&gt;
        mapper[key] = []&lt;br /&gt;
    mapper[key].append(c)&lt;br /&gt;
mapper&lt;br /&gt;
&lt;br /&gt;
### 2&lt;br /&gt;
&lt;br /&gt;
from random import choice&lt;br /&gt;
&lt;br /&gt;
def generate(word1, word2, N):&lt;br /&gt;
    sent = [word1, word2]&lt;br /&gt;
    for i in xrange(N):&lt;br /&gt;
        words = mapper[(word1, word2)]&lt;br /&gt;
        next_word = choice(words)&lt;br /&gt;
        sent.append(next_word)&lt;br /&gt;
        word1, word2 = word2, next_word&lt;br /&gt;
    return sent&lt;br /&gt;
    &lt;br /&gt;
word1, word2 = &amp;#039;The&amp;#039;, &amp;#039;Tragedie&amp;#039;&lt;br /&gt;
N = 5&lt;br /&gt;
generate(word1, word2, N)&lt;br /&gt;
&lt;br /&gt;
### 3&lt;br /&gt;
&lt;br /&gt;
# words = (&amp;#039;The Tragedie of Macbeth by William Shakespeare . &amp;#039;&lt;br /&gt;
#          &amp;#039;The Tragedie is great .&amp;#039;).split(&amp;#039; &amp;#039;)&lt;br /&gt;
words = nltk.corpus.gutenberg.words(&amp;#039;austen-emma.txt&amp;#039;)&lt;br /&gt;
trigrams = nltk.trigrams(words)&lt;br /&gt;
mapper = {}&lt;br /&gt;
for a, b, c in trigrams:&lt;br /&gt;
    key = (a, b)&lt;br /&gt;
    if key not in mapper:&lt;br /&gt;
        mapper[key] = []&lt;br /&gt;
    mapper[key].append(c)&lt;br /&gt;
mapper&lt;br /&gt;
&lt;br /&gt;
from random import choice&lt;br /&gt;
&lt;br /&gt;
def generate(word1, word2, N):&lt;br /&gt;
    sent = [word1, word2]&lt;br /&gt;
    for i in xrange(N):&lt;br /&gt;
        words = mapper[(word1, word2)]&lt;br /&gt;
        next_word = choice(words)&lt;br /&gt;
        sent.append(next_word)&lt;br /&gt;
        word1, word2 = word2, next_word&lt;br /&gt;
    return sent&lt;br /&gt;
    &lt;br /&gt;
word1, word2 = choice(mapper.keys())&lt;br /&gt;
N = 50&lt;br /&gt;
tokens = generate(word1, word2, N)&lt;br /&gt;
print &amp;#039; &amp;#039;.join(tokens)&lt;br /&gt;
&lt;br /&gt;
### 4 &lt;br /&gt;
&lt;br /&gt;
# Exercise: &lt;br /&gt;
# Write a function find_language that takes a word and returns a list of languages that this word may be in. Use the udhr corpus. Narrow down your search scope to files in Latin1 encoding. Look up the words one and ein.&lt;br /&gt;
&lt;br /&gt;
files = [filename &lt;br /&gt;
         for filename in nltk.corpus.udhr.fileids()&lt;br /&gt;
         if filename.endswith(&amp;#039;Latin1&amp;#039;)]&lt;br /&gt;
files&lt;br /&gt;
&lt;br /&gt;
def find_language(word):&lt;br /&gt;
    return [&lt;br /&gt;
        language&lt;br /&gt;
        for language in files &lt;br /&gt;
        if word in nltk.corpus.udhr.words(language)&lt;br /&gt;
    ]&lt;br /&gt;
&lt;br /&gt;
find_language(&amp;#039;one&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
### 5&lt;br /&gt;
&lt;br /&gt;
words = nltk.corpus.gutenberg.words(&amp;#039;austen-emma.txt&amp;#039;)&lt;br /&gt;
words = [w.lower() for w in words if w.isalpha()]&lt;br /&gt;
fd = nltk.FreqDist(words)&lt;br /&gt;
&lt;br /&gt;
fd[&amp;#039;the&amp;#039;]&lt;br /&gt;
&lt;br /&gt;
fd.freq(&amp;#039;the&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
fd.most_common()[:10]&lt;br /&gt;
&lt;br /&gt;
### 6&lt;br /&gt;
&lt;br /&gt;
# Zipf’s Law states that the frequency of a word is inversely proportional to its rank. Does this law hold for English?&lt;br /&gt;
&lt;br /&gt;
# Hint: use logarithmic x and y axes when plotting number of occurrences against rank.&lt;br /&gt;
&lt;br /&gt;
fd = nltk.FreqDist(nltk.corpus.gutenberg.words(&amp;#039;bible-kjv.txt&amp;#039;))&lt;br /&gt;
occurences = [? for x in fd.most_common()]&lt;br /&gt;
&lt;br /&gt;
from matplotlib import pylab&lt;br /&gt;
pylab.xscale(&amp;#039;log&amp;#039;)&lt;br /&gt;
pylab.yscale(&amp;#039;log&amp;#039;)&lt;br /&gt;
pylab.plot(occurences)&lt;br /&gt;
&lt;br /&gt;
### 7&lt;br /&gt;
&lt;br /&gt;
cfd = nltk.ConditionalFreqDist(&lt;br /&gt;
    (category, word)&lt;br /&gt;
    for category in nltk.corpus.brown.categories()&lt;br /&gt;
    for word in nltk.corpus.brown.words(categories=category)&lt;br /&gt;
)&lt;br /&gt;
&lt;br /&gt;
### 8 &lt;br /&gt;
&lt;br /&gt;
# How frequently do letters appear in different languages?&lt;br /&gt;
# Use swadesh corpus.&lt;br /&gt;
cfd = nltk.ConditionalFreqDist([&lt;br /&gt;
    ???&lt;br /&gt;
    for language in ???&lt;br /&gt;
    for letter in ???&lt;br /&gt;
    if ???  # if letter is a character, not whitespace or punctuation&lt;br /&gt;
])&lt;br /&gt;
cfd.tabulate(samples=&amp;#039;abcdefghijklmnopqrstuvwxyz&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
### 9&lt;br /&gt;
&lt;br /&gt;
# Which initial letters are more frequent for males versus females?&lt;br /&gt;
# Use names corpus.&lt;br /&gt;
&lt;br /&gt;
cfd = nltk.ConditionalFreqDist([&lt;br /&gt;
    ???&lt;br /&gt;
    for gender in ???&lt;br /&gt;
    for name in ???&lt;br /&gt;
])&lt;br /&gt;
cfd.plot()&lt;br /&gt;
&lt;br /&gt;
### 10&lt;br /&gt;
&lt;br /&gt;
# What is the lexical diversity of different genres? Lexical diversity is the ratio of the number of words to the number of unique words. That is, it’s the average number of times each word appears in the text.&lt;br /&gt;
# Use brown corpus.&lt;br /&gt;
&lt;br /&gt;
# 1. Write the function that computes lexical diversity:&lt;br /&gt;
&lt;br /&gt;
def lexical_diversity(words):&lt;br /&gt;
    return ???&lt;br /&gt;
&lt;br /&gt;
# 2. Test that lexical_diversity function works:&lt;br /&gt;
&amp;gt;&amp;gt;&amp;gt; lexical_diversity([&amp;#039;a&amp;#039;, &amp;#039;b&amp;#039;, &amp;#039;b&amp;#039;])&lt;br /&gt;
1.5&lt;br /&gt;
&lt;br /&gt;
# 3. Compute lexical diversity for each category from brown corpus. &lt;br /&gt;
for category in ???:&lt;br /&gt;
    ???&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/source&amp;gt;&lt;/div&gt;</summary>
		<author><name>Bernard Szlachta</name></author>
	</entry>
</feed>