Feng Li
School of Statistics and Mathematics
Central University of Finance and Economics
A corpus is a large collection of texts: a body of written or spoken material upon which a linguistic analysis is based. A corpus provides grammarians, lexicographers, and other interested parties with better descriptions of a language. Computer-processable corpora allow linguists to adopt the principle of total accountability, retrieving all the occurrences of a particular word or structure for inspection, or randomly selected samples. Corpus analysis provides lexical, morphosyntactic, semantic, and pragmatic information.
A token is the technical name for a sequence of characters that we want to treat as a group. The vocabulary of a text is just the set of tokens that it uses, since in a set all duplicates are collapsed together. In Python we can obtain the vocabulary items with the built-in set().
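For example, a minimal sketch (the token list is made up for illustration):
tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
vocabulary = set(tokens)        # duplicates collapse into a single entry
len(tokens), len(vocabulary)    # (6, 5)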
Stopwords are common words that generally do not contribute to the meaning of a sentence, at least for the purposes of information retrieval and natural language processing. These are words such as "the" and "a". Most search engines will filter out stopwords from search queries and documents in order to save space in their index.
Stemming is a technique for removing affixes from a word, leaving the stem. For example, the stem of cooking is cook, and a good stemming algorithm knows that the ing suffix can be removed. Stemming is most commonly used by search engines for indexing words. Instead of storing all forms of a word, a search engine can store only the stems, greatly reducing the size of the index while increasing retrieval accuracy.
A frequency count is the number of hits of a particular feature. Obtaining frequency counts requires finding all the occurrences of that feature in the corpus, so frequency counting is implicit in concordancing. Software is used for this purpose, and the resulting counts can be analyzed statistically.
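As a small illustration, NLTK's FreqDist (a frequency distribution over tokens) produces such counts; the sentence is made up, and word_tokenize assumes the punkt data introduced later in these notes has been downloaded:
from nltk import FreqDist
from nltk.tokenize import word_tokenize

fdist = FreqDist(word_tokenize("the cat sat on the mat"))
fdist['the']            # 2
fdist.most_common(2)    # [('the', 2), ('cat', 1)]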
Word segmentation is the problem of dividing a string of written language into its component words.
In English and many other languages using some form of the Latin alphabet, the space is a good approximation of a word divider (word delimiter), although the space character alone is not always sufficient; contractions such as can't for cannot are one example.
However, the equivalent of this character is not found in all written scripts, and without it word segmentation is a difficult problem. Languages that do not have a trivial word segmentation process include Chinese and Japanese, where sentences but not words are delimited; Thai and Lao, where phrases and sentences but not words are delimited; and Vietnamese, where syllables but not words are delimited.
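A rough illustration in Python (the example strings are made up): whitespace splitting is a workable first approximation for English, but recovers nothing for a script written without word delimiters.
"Time flies like an arrow.".split()
# ['Time', 'flies', 'like', 'an', 'arrow.']  -- punctuation stays attached to the last word
"我爱自然语言处理".split()   # "I love natural language processing"
# ['我爱自然语言处理']  -- no spaces, so the individual words cannot be recovered this way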
In corpus linguistics, part-of-speech tagging (POS tagging or POST), also called grammatical tagging or word-category disambiguation, is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition and its context, i.e. its relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc.
Named-entity recognition (NER) (also known as entity identification, entity chunking, and entity extraction) is a subtask of information extraction that seeks to locate and classify elements in text into pre-defined categories such as the names of persons, organizations, and locations, and expressions of time, quantities, monetary values, and percentages.
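NLTK ships a simple pre-trained named-entity chunker. The sketch below assumes the relevant data packages (punkt, the averaged perceptron tagger, the maxent NE chunker, and words) have been downloaded; the sentence is made up:
import nltk

sentence = "Pierre Vinken joined the board of Elsevier in New York on Nov. 29."
tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
print(tree)   # named entities such as PERSON, ORGANIZATION and GPE appear as labelled subtrees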
Natural Language Toolkit
NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active discussion forum.
Natural Language Processing with Python provides a practical introduction to programming for language processing. Written by the creators of NLTK, it guides the reader through the fundamentals of writing Python programs, working with corpora, categorizing text, analyzing linguistic structure, and more.
Stanford Word Segmenter
Tokenization of raw text is a standard pre-processing step for many NLP tasks. For English, tokenization usually involves punctuation splitting and separation of some affixes like possessives. Other languages require more extensive token pre-processing, which is usually called segmentation.
The Stanford Word Segmenter currently supports Arabic and Chinese. The provided segmentation schemes have been found to work well for a variety of applications.
NLP toolkits for Chinese
Before you use the NLTK library, please use the NLTK Downloader to obtain the required resources. The NLTK data will be downloaded into one of the following directories, depending on your OS:
$HOME/nltk_data
/usr/share/nltk_data
/usr/local/share/nltk_data
/usr/lib/nltk_data
/usr/local/lib/nltk_data
/usr/nltk_data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('treebank')
[nltk_data] Downloading package punkt to /home/fli/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/fli/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package treebank to /home/fli/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
True
The sent_tokenize() function uses an instance of PunktSentenceTokenizer from the nltk.tokenize.punkt module. This instance has already been trained and works well for many European languages, so it knows what punctuation and characters mark the end of a sentence and the beginning of a new sentence.
para = "Python is a widely used general-purpose, high-level programming language. \
Its design philosophy emphasizes code readability, and its syntax allows programmers \
to express concepts in fewer lines of code than would be possible in languages such as \
C++ or Java. The language provides constructs intended to enable clear programs \
on both a small and large scale."
from nltk.tokenize import sent_tokenize
sent_tokenize(para)
['Python is a widely used general-purpose, high-level programming language.', 'Its design philosophy emphasizes code readability, and its syntax allows programmers to express concepts in fewer lines of code than would be possible in languages such as C++ or Java.', 'The language provides constructs intended to enable clear programs on both a small and large scale.']
from nltk.tokenize import word_tokenize
word_tokenize('Hello World.')
['Hello', 'World', '.']
tok = word_tokenize(para)
print(tok)
['Python', 'is', 'a', 'widely', 'used', 'general-purpose', ',', 'high-level', 'programming', 'language', '.', 'Its', 'design', 'philosophy', 'emphasizes', 'code', 'readability', ',', 'and', 'its', 'syntax', 'allows', 'programmers', 'to', 'express', 'concepts', 'in', 'fewer', 'lines', 'of', 'code', 'than', 'would', 'be', 'possible', 'in', 'languages', 'such', 'as', 'C++', 'or', 'Java', '.', 'The', 'language', 'provides', 'constructs', 'intended', 'to', 'enable', 'clear', 'programs', 'on', 'both', 'a', 'small', 'and', 'large', 'scale', '.']
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize("Can't is a contraction.")
["Can't", 'is', 'a', 'contraction']
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
print(english_stops)
{'at', 'be', 'yours', 'own', 'if', 'we', 'that', 'll', 'again', 'had', 'you', 'aren', 'their', 'in', 't', "couldn't", 'ain', 'an', 'which', 'from', "mustn't", 'can', "mightn't", 'i', 'more', 'yourselves', 'as', 'should', 'weren', 'the', 'but', 'very', 'until', 'just', 'with', 'wouldn', 'too', 'below', 'she', 'further', 'will', 'now', 'why', "aren't", 'than', 'do', 'have', 'to', "don't", "hasn't", "you're", 'this', 'did', "won't", 'won', 'and', 'him', 'am', 'other', 'it', 'hers', 've', 'wasn', 'off', 'they', 'above', 'them', "haven't", 'before', 'where', 'there', 'being', 'nor', 'our', 'who', 'been', 'by', 'some', 'has', 'only', 'on', 'd', 'through', 'm', 'is', 'didn', 'ourselves', 'theirs', 'does', 'about', 'needn', 'those', 'between', "that'll", 'or', 'under', 'no', 'shan', "wasn't", "you'll", 'not', "hadn't", 'both', 'himself', "shouldn't", 'out', 'mustn', 'hadn', 'during', 'don', 'while', 'same', 'so', 'whom', 'then', 'few', 'shouldn', 'for', 'of', 'hasn', 'such', 'how', 'are', 'doing', 'after', 'its', "needn't", 'most', 're', 'isn', "shan't", "weren't", 'up', "isn't", 'his', 'haven', 'down', 'itself', "you've", "should've", 'her', "didn't", 'my', 'because', 'themselves', 'all', "doesn't", 'having', 'here', 's', 'myself', "wouldn't", "you'd", "it's", 'once', 'herself', 'each', 'mightn', 'ours', 'over', 'into', 'when', 'your', 'was', 'these', 'o', 'were', 'a', 'me', 'he', "she's", 'any', 'doesn', 'what', 'against', 'y', 'couldn', 'ma', 'yourself'}
words = ["Can't", 'is', 'a', 'contraction']
[word for word in words if word not in english_stops]
["Can't", 'contraction']
One of the most common stemming algorithms is the Porter stemming algorithm by Martin Porter. It is designed to remove and replace well-known suffixes of English words.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('cooking')
'cook'
In everyday language, people are often not strictly grammatical. They will write things such as "I looooooove it" in order to emphasize the word love. However, computers don't know that "looooooove" is a variation of "love" unless they are told. The code below defines two small helpers: a RegexpReplacer that expands common English contractions, and a RepeatReplacer that removes these annoying repeating characters in order to end up with a proper English word.
import re
from nltk.corpus import wordnet
# (regex, replacement) pairs for expanding common English contractions
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would')
]

class RegexpReplacer(object):
    """Expand contractions by applying each (regex, replacement) pattern in turn."""
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            (s, count) = re.subn(pattern, repl, s)
        return s
class RepeatReplacer(object):
    """Remove repeated characters (e.g. 'looooooove') until a WordNet word is found."""
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        if wordnet.synsets(word):
            return word    # already a known word, stop here
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)    # keep stripping one repeated character at a time
        else:
            return repl_word
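A quick usage sketch of both classes (the inputs are made up; the WordNet lookup requires the wordnet data package):
RegexpReplacer().replace("can't is a contraction")   # 'cannot is a contraction'
RepeatReplacer().replace('looooooove')               # 'love'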
It is often useful to reduce the vocabulary of a text by replacing words with common synonyms. By compressing the vocabulary without losing meaning, you can save memory in cases such as frequency analysis and text indexing. Vocabulary reduction can also increase the occurrence of significant collocations.
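One simple way to do this is a lookup table that maps words to preferred synonyms; the WordReplacer class and the tiny word map below are illustrative only, not part of NLTK:
class WordReplacer(object):
    """Replace a word with a preferred synonym from a hand-built mapping."""
    def __init__(self, word_map):
        self.word_map = word_map
    def replace(self, word):
        return self.word_map.get(word, word)

replacer = WordReplacer({'bday': 'birthday', 'gr8': 'great'})
replacer.replace('bday')    # 'birthday'
replacer.replace('happy')   # 'happy' (not in the map, returned unchanged)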
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
treebank.sents()[0]
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
tagger.tag(treebank.sents()[0])
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
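To gauge how well the tagger generalizes, hold out the remaining treebank sentences and measure per-token tagging accuracy. The sketch below assumes the standard NLTK tagger interface; older versions expose evaluate(), newer ones accuracy():
test_sents = treebank.tagged_sents()[3000:]   # held-out sentences not used for training
tagger.evaluate(test_sents)                   # per-token accuracy (use tagger.accuracy(test_sents) on newer NLTK)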
One approach to visualizing words and counts is word clouds, which artistically lay out the words with sizes proportional to their counts.
Generally, though, data scientists don’t think much of word clouds, in large part because the placement of the words doesn’t mean anything other than “here’s some space where I was able to fit a word.”
If you ever are forced to create a word cloud, think about whether you can make the axes convey something. For example, imagine that, for each of some collection of data science–related buzzwords, you have two numbers between 0 and 100—the first representing how frequently it appears in job postings, the second how frequently it appears on resumes:
import matplotlib.pyplot as plt
def plot_resumes(plt):
    data = [("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50),
            ("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60),
            ("data science", 60, 70), ("analytics", 90, 3),
            ("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0),
            ("actionable insights", 40, 30), ("think out of the box", 45, 10),
            ("self-starter", 30, 50), ("customer focus", 65, 15),
            ("thought leadership", 35, 35)]

    def text_size(total):
        """equals 8 if total is 0, 28 if total is 200"""
        return 8 + total / 200 * 20

    for word, job_popularity, resume_popularity in data:
        plt.text(job_popularity, resume_popularity, word,
                 ha='center', va='center',
                 size=text_size(job_popularity + resume_popularity))
    plt.xlabel("Popularity on Job Postings")
    plt.ylabel("Popularity on Resumes")
    plt.axis([0, 100, 0, 100])
    plt.show()

plot_resumes(plt)
All are statistics!