Probabilistic Language Models¶

Feng Li

School of Statistics and Mathematics

Central University of Finance and Economics

feng.li@cufe.edu.cn

https://feng.li/python

Probabilistic Language Models¶

How can we assign a probability to a sentence?

P(high winds tonight) > P(large winds tonight)


How do we perform proper spelling correction?

  • The office is about fifteen minuets from my house

    P(about fifteen minutes from) > P(about fifteen minuets from)
    
    
    
  • How do we improve the accuracy of speech recognition?

    P(I saw a van) >> P(eyes awe of an)
    

The Goal of a Language Model:¶

We compute the probability of a sentence or sequence of words:

$$P(W) = P(w_1,w_2,w_3,w_4,w_5,...,w_n)$$

Related task: probability of an upcoming word:

$$P(w_5|w_1,w_2,w_3,w_4)$$

A model that computes either of these, $P(W)$ or $P(w_n|w_1,w_2,...,w_{n-1})$, is called a language model.

How to compute this joint probability:¶

$$P(its, water, is, so, transparent, that)$$
  • Intuition: let’s rely on the Chain Rule of Probability

  • The Chain Rule in General

$$P(x_1,x_2,x_3,...,x_n) = P(x_1)P(x_2|x_1)P(x_3|x_1,x_2)...P(x_n|x_1,...,x_{n-1})$$
  • The joint probability then factorizes as
$$P(its~water~is~so~transparent) = P(its) \times P(water|its) \times P(is|its~water) \times P(so|its~water~is) \times P(transparent|its~water~is~so)$$

Simplifying assumption:¶

  • Markov Assumption
$$ P(w_1,w_2,...,w_n) \approx \prod_{i=1}^n P(w_i|w_{i-k},...,w_{i-1}) $$

where $k$ is some fixed positive integer (the size of the context window).

  • In other words, we approximate each component in the product
$$ P(w_i|w_1,...,w_{i-1}) \approx P(w_i|w_{i-k},...,w_{i-1}) $$
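As a minimal sketch of the Markov assumption with $k=1$ (a bigram model), each conditional probability can be estimated by maximum likelihood from corpus counts. The toy corpus and helper names below are made up for illustration:

from collections import Counter

# Toy corpus for illustration; sentences are padded with boundary markers.
corpus = [
    "<s> its water is so transparent </s>".split(),
    "<s> its water is so clear </s>".split(),
]

unigrams = Counter(w for sent in corpus for w in sent)
bigrams = Counter(pair for sent in corpus for pair in zip(sent, sent[1:]))

def bigram_prob(prev, word):
    """MLE estimate of P(word | prev) = count(prev, word) / count(prev)."""
    return bigrams[(prev, word)] / unigrams[prev]

def sentence_prob(sent):
    """P(sentence) under the bigram approximation (no smoothing)."""
    p = 1.0
    for prev, word in zip(sent, sent[1:]):
        p *= bigram_prob(prev, word)
    return p

print(bigram_prob("so", "transparent"))                               # 0.5
print(sentence_prob("<s> its water is so transparent </s>".split()))  # 0.5

A real language model would add smoothing for unseen n-grams. The rest of this notebook turns to a related task: preprocessing a text corpus and fitting an LDA topic model with gensim.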
In [1]:
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
In [2]:
# Work relative to the notebook's directory ('__file__' is a plain string
# here, so this resolves against the current working directory)
os.chdir(os.path.dirname(os.path.realpath('__file__')))
In [3]:
# Load the 'Explanation' column of the guba dataset as a list of raw strings
merged = pd.read_excel('data/guba.xlsx', sheet_name='Merged')
merged = merged['Explanation'].tolist()

# Tokenizer that keeps runs of word characters (drops punctuation)
tokenizer = RegexpTokenizer(r'\w+')
In [4]:
# Tokenize each document into a list of word tokens
merged = [tokenizer.tokenize(doc) for doc in merged]
In [5]:
# Clean each document: lowercase, drop numeric tokens, remove stop words
def word_clean(sentence, stop_words):
    sentence = [token.lower() for token in sentence]
    sentence = [token for token in sentence if not token.isnumeric()]
    sentence = [token for token in sentence if token not in stop_words]
    return sentence

stop_words = pd.read_csv('data/stopwords.txt', header=None)[0].to_list()
merged = [word_clean(sentence, stop_words) for sentence in merged]
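A quick sanity check of word_clean on a made-up token list:

word_clean(['The', 'Bank', '2021', 'rose'], stop_words=['the'])
# -> ['bank', 'rose']: lowercased, with the numeric token and stop word removed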
In [6]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
In [7]:
# Lemmatize the documents.

lemmatizer = WordNetLemmatizer()
merged = [[lemmatizer.lemmatize(token) for token in doc] for doc in merged]
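The WordNet lemmatizer maps inflected forms back to their dictionary form (treating tokens as nouns by default), for example:

lemmatizer.lemmatize('companies')  # -> 'company'
lemmatizer.lemmatize('returns')    # -> 'return'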
In [8]:
# Compute bigrams.
from gensim.models import Phrases

# Merge frequently co-occurring token pairs into single bigram tokens
bigram = Phrases(merged, min_count=1)
merged = [bigram[lst] for lst in merged]
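Phrases scores adjacent token pairs and joins those scoring above a threshold with an underscore. A toy illustration (sentences made up, threshold lowered so the repeated pair merges):

toy = [['new', 'york', 'city'], ['new', 'york', 'taxi'], ['old', 'town']]
toy_bigram = Phrases(toy, min_count=1, threshold=1)
print(toy_bigram[['new', 'york', 'city']])  # expected: ['new_york', 'city']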
In [9]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
merged_dictionary = Dictionary(merged)

# Keep tokens that appear in at least 1 document (no lower cutoff in effect)
# and in no more than 60% of the documents.
merged_dictionary.filter_extremes(no_below=1, no_above=0.6)
In [10]:
# Bag-of-words representation of the documents.

merged_corpus = [merged_dictionary.doc2bow(doc) for doc in merged]
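doc2bow turns a document into a sparse list of (token_id, count) pairs, silently dropping tokens that are not in the dictionary. A quick check on a made-up token list (the ids depend on the fitted dictionary):

merged_dictionary.doc2bow(['stock', 'stock', 'bank'])
# e.g. [(102, 2), (520, 1)] -- (token_id, count) pairs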
In [11]:
# Train merged model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 5
chunksize = 20
passes = 5
iterations = 200
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index-to-word dictionary.
temp = merged_dictionary[0]  # Accessing any item forces gensim to populate id2token.
id2word = merged_dictionary.id2token

merged_model = LdaModel(
    corpus=merged_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',  
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)
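Before visualizing, the fitted topics can be inspected directly; print_topics returns the highest-weight words of each topic as formatted strings:

for topic_id, topic in merged_model.print_topics(num_topics=num_topics, num_words=8):
    print(topic_id, topic)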
In [17]:
# We need a specific version of pyLDAvis. Install it into the local `modules`
# folder (-I ignores installed packages, -t sets the target directory).
! pip3 install pyLDAvis -I -t modules
Looking in indexes: https://mirrors.163.com/pypi/simple/
Collecting pyLDAvis
  Using cached pyLDAvis-3.3.1-py2.py3-none-any.whl
...
Installing collected packages: numpy, threadpoolctl, six, scipy, joblib, smart-open, scikit-learn, pytz, python-dateutil, MarkupSafe, sklearn, setuptools, pandas, numexpr, jinja2, gensim, future, funcy, pyLDAvis
Successfully installed MarkupSafe-2.0.1 funcy-1.15 future-0.18.2 gensim-4.1.2 jinja2-3.0.2 joblib-1.0.1 numexpr-2.7.3 numpy-1.21.2 pandas-1.3.4 pyLDAvis-3.3.1 python-dateutil-2.8.1 pytz-2021.1 scikit-learn-1.0.1 scipy-1.7.2 setuptools-58.2.0 six-1.15.0 sklearn-0.0 smart-open-5.2.1 threadpoolctl-2.1.0
In [3]:
# prepend the `modules` folder to Python's search path 
import sys
sys.path.insert(0, 'modules')
sys.path
Out[3]:
['modules',
 'modules',
 '/home/fli/cloud/teaching/python/python-slides/P08-Advanced-Topics',
 '/usr/lib/python39.zip',
 '/usr/lib/python3.9',
 '/usr/lib/python3.9/lib-dynload',
 '',
 '/home/fli/.local/lib/python3.9/site-packages',
 '/usr/local/lib/python3.9/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.9/dist-packages/IPython/extensions',
 '/home/fli/.ipython']
In [18]:
import pyLDAvis
from pyLDAvis import gensim_models
vis = gensim_models.prepare(merged_model, merged_corpus, dictionary=merged_dictionary)
/home/fli/cloud/teaching/python/python-slides/P08-Advanced-Topics/modules/pyLDAvis/_prepare.py:246: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
  default_term_info = default_term_info.sort_values(
In [15]:
pyLDAvis.save_html(vis, 'lda.html')
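The visualization can also be rendered inline in the notebook:

pyLDAvis.enable_notebook()
pyLDAvis.display(vis)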
In [16]:
vis.sorted_terms()
Out[16]:
Term Freq Total Category logprob loglift relevance
377 product 23.913529 30.062396 Topic1 -3.2928 0.9540 -3.2928
627 real_estate 10.393194 12.188516 Topic1 -4.1261 1.0235 -4.1261
246 company 7.408749 9.892665 Topic1 -4.4646 0.8937 -4.4646
309 financial 6.197204 8.785535 Topic1 -4.6432 0.8339 -4.6432
800 zhangjiang_tech 5.380072 5.830456 Topic1 -4.7846 1.1025 -4.7846
558 expected_return 5.316008 5.792948 Topic1 -4.7966 1.0970 -4.7966
520 bank 4.582158 5.053505 Topic1 -4.9451 1.0850 -4.9451
251 future 4.360551 4.810861 Topic1 -4.9947 1.0846 -4.9947
102 stock 3.904119 11.538281 Topic1 -5.1053 0.0992 -5.1053
540 customer 3.847856 4.313823 Topic1 -5.1198 1.0686 -5.1198
265 term 3.805859 5.013711 Topic1 -5.1307 0.9072 -5.1307
619 project 3.664846 5.127102 Topic1 -5.1685 0.8471 -5.1685
245 business 3.434275 6.798940 Topic1 -5.2335 0.4999 -5.2335
567 fund 3.249944 6.864512 Topic1 -5.2887 0.4352 -5.2887
582 invest 3.138705 3.589726 Topic1 -5.3235 1.0486 -5.3235
636 return 3.127770 3.583477 Topic1 -5.3270 1.0469 -5.3270
660 trust 3.127770 3.583477 Topic1 -5.3270 1.0469 -5.3270
562 financing 3.112968 3.573827 Topic1 -5.3317 1.0448 -5.3317
638 risk 3.112968 3.573827 Topic1 -5.3317 1.0448 -5.3317
666 wealth_management 3.112968 3.573827 Topic1 -5.3317 1.0448 -5.3317
241 annual 2.878278 3.321743 Topic1 -5.4101 1.0396 -5.4101
782 board 2.854602 4.410119 Topic1 -5.4184 0.7479 -5.4184
24 control 2.583636 3.367196 Topic1 -5.5181 0.9180 -5.5181
799 zhangjiang 2.400752 2.847076 Topic1 -5.5915 1.0124 -5.5915
790 park 2.400752 2.847076 Topic1 -5.5915 1.0124 -5.5915
655 threshold 2.396182 2.844386 Topic1 -5.5934 1.0114 -5.5934
625 rate 2.388913 2.841308 Topic1 -5.5965 1.0095 -5.5965
661 type 2.388913 2.841308 Topic1 -5.5965 1.0095 -5.5965
565 focus 2.377280 2.833401 Topic1 -5.6013 1.0074 -5.6013
665 wealth 2.377280 2.833401 Topic1 -5.6013 1.0074 -5.6013
604 ningbo_branch 2.377280 2.833401 Topic1 -5.6013 1.0074 -5.6013
603 ningbo 2.377280 2.833401 Topic1 -5.6013 1.0074 -5.6013
523 buy_product 2.377280 2.833401 Topic1 -5.6013 1.0074 -5.6013
518 average_annualized 2.377280 2.833401 Topic1 -5.6013 1.0074 -5.6013
766 size 2.358410 2.833941 Topic1 -5.6093 0.9992 -5.6093
721 infrastructure 2.358410 2.833941 Topic1 -5.6093 0.9992 -5.6093
716 huge 2.358410 2.833941 Topic1 -5.6093 0.9992 -5.6093
672 anxin 2.358410 2.833941 Topic1 -5.6093 0.9992 -5.6093
717 improve 2.358410 2.833941 Topic1 -5.6093 0.9992 -5.6093
794 senior_executive 2.299076 2.742155 Topic1 -5.6348 1.0066 -5.6348