PyMC3 how to implement latent dirichlet allocation? - lda

I am trying to implement lda using PyMC3.
However, when defining the last part of the model in which words are sampled based on their topics, I keep getting the error: TypeError: list indices must be integers, not TensorVariable
How to tackle the problem?
The code is as follows:
## Data Preparation
K = 2  # number of topics
N = 4  # number of words
D = 3  # number of documents
import numpy as np
data = np.array([[1, 1, 1, 1], [1, 1, 1, 1], [0, 0, 0, 0]])
Wd = [len(doc) for doc in data]  # length of each document

## Model Specification
from pymc3 import Model, Normal, HalfNormal, Dirichlet, Categorical, constant

lda_model = Model()
with lda_model:
    # Priors for unknown model parameters
    alpha = HalfNormal('alpha', sd=1)
    eta = HalfNormal('eta', sd=1)
    a1 = eta * np.ones(shape=N)
    a2 = alpha * np.ones(shape=K)
    # One word distribution per topic and one topic distribution per document.
    beta = [Dirichlet('beta_%i' % i, a1, shape=N) for i in range(K)]
    theta = [Dirichlet('theta_%s' % i, a2, shape=K) for i in range(D)]
    # Topic assignment for every word position of every document.
    z = [Categorical('z_%i' % d, p=theta[d], shape=Wd[d]) for d in range(D)]
    # That's when you get the error. It is caused by: beta[z[d][w]]
    # (beta is a Python list, and z[d][w] is a symbolic variable, not an int).
    # The observation for word w of document d is data[d, w]; the names i and j
    # are not defined inside this comprehension.
    w = [Categorical('w_%i_%i' % (d, w), p=beta[z[d][w]], observed=data[d, w])
         for d in range(D) for w in range(Wd[d])]
Any help would be much appreciated!

beta[z[d][w]] is naturally incorrect because z[d][w] is a variable stored by PyMC instead of being a fixed index.
In pymc2 it is solved by lambda function
# Deterministic node that looks up beta at the current value of z[d][w].
# The node name is built from the same (d, w) indices that the lambda uses.
p=pm.Lambda("phi_z_%s_%s" % (d,w),
            lambda z=z[d][w], beta=beta: beta[z])
In pymc3 it is supposed to be solved by
@theano.compile.ops.as_op
def your_function
But there is a problem here: it seems that Theano doesn't allow passing a Python list of PyMC variables. t.lvector basically doesn't work.
More discussion is in this question:
Unable to create lambda function in hierarchical pymc3 model

The following code was adapted from what has been referenced by @Hanan. I've somehow made it work with pymc3.
import numpy as np
import pymc3 as pm
def get_word_dict(collection):
    """Map every distinct word in *collection* to an integer index.

    Indices are assigned in order of first appearance, starting at 0, so the
    mapping is the same on every run (iterating a set of strings is not).

    Args:
        collection: iterable of documents, each an iterable of hashable words.

    Returns:
        dict mapping each distinct word to its index.
    """
    vocab_idx = {}
    for doc in collection:
        for word in doc:
            if word not in vocab_idx:
                # The next free index equals the number of words seen so far.
                vocab_idx[word] = len(vocab_idx)
    return vocab_idx
def word_to_idx(dict_vocab_idx, collection):
    """Replace every word of every document by its vocabulary index.

    Args:
        dict_vocab_idx: mapping from word to integer index.
        collection: iterable of documents, each an iterable of words.

    Returns:
        A list with one list of indices per document.

    Raises:
        KeyError: if a word is missing from *dict_vocab_idx*.
    """
    return [[dict_vocab_idx[word] for word in doc] for doc in collection]
docs = [["sepak","bola","sepak","bola","bola","bola","sepak"],
        ["uang","ekonomi","uang","uang","uang","ekonomi","ekonomi"],
        ["sepak","bola","sepak","bola","sepak","sepak"],
        ["ekonomi","ekonomi","uang","uang"],
        ["sepak","uang","ekonomi"],
        ["komputer","komputer","teknologi","teknologi","komputer","teknologi"],
        ["teknologi","komputer","teknologi"]]

# Encode the corpus as lists of integer word ids.
dict_vocab_idx = get_word_dict(docs)
idxed_collection = word_to_idx(dict_vocab_idx, docs)

n_topics = 3
n_vocab = len(dict_vocab_idx)
n_docs = len(idxed_collection)
length_docs = [len(doc) for doc in idxed_collection]
n_samples = 2000  # number of draws requested from pm.sample

# Flat (all-ones) Dirichlet concentration parameters.
alpha = np.ones([n_docs, n_topics])
beta = np.ones([n_topics, n_vocab])

with pm.Model() as model:
    # Per-document topic distributions and per-topic word distributions.
    theta = pm.distributions.Dirichlet('theta', a=alpha, shape=(n_docs, n_topics))
    phi = pm.distributions.Dirichlet('phi', a=beta, shape=(n_topics, n_vocab))
    # One vector of topic assignments per document.
    zs = [pm.Categorical("z_d{}".format(d), p=theta[d], shape=length_docs[d])
          for d in range(n_docs)]
    # phi is a single tensor here (not a Python list), so it is indexed
    # directly by the topic variable zs[d][i].
    ws = [pm.Categorical("w_{}_{}".format(d, i), p=phi[zs[d][i]],
                         observed=idxed_collection[d][i])
          for d in range(n_docs) for i in range(length_docs[d])]
    trace = pm.sample(n_samples)

# Print the topic assignments of each document at draw number n_samples.
for d in range(n_docs):
    value_z = trace.get_values("z_d{}".format(d))
    print(value_z[n_samples - 1])

check out this blog post. I haven't tested it.
import numpy as np
import pymc as pc
def wordDict(collection):
    """Assign an integer id to every distinct word, in order of first appearance.

    Args:
        collection: iterable of documents, each an iterable of hashable words.

    Returns:
        dict mapping word -> id, with ids counting up from 0.
    """
    word_id = {}
    for doc in collection:
        for word in doc:
            if word not in word_id:
                # The number of words registered so far is the next free id.
                word_id[word] = len(word_id)
    return word_id
def toNpArray(word_id, collection):
    """Convert documents of words into a NumPy array of word ids.

    Words missing from *word_id* are encoded as 0 (the same id as the first
    registered word), as in the original implementation.

    Args:
        word_id: mapping from word to integer id.
        collection: iterable of documents, each an iterable of words.

    Returns:
        A 2-D integer array when all documents have the same length;
        otherwise a 1-D object array holding one list of ids per document.
    """
    ds = [[word_id.get(w, 0) for w in d] for d in collection]
    if len({len(ws) for ws in ds}) <= 1:
        return np.array(ds)
    # Documents of different lengths: recent NumPy versions refuse to build
    # an array from ragged nested lists implicitly, so fill an object array
    # element by element instead.
    ragged = np.empty(len(ds), dtype=object)
    for k, ws in enumerate(ds):
        ragged[k] = ws
    return ragged
###################################################
# doc1, doc2, ..., doc7
docs = [["sepak","bola","sepak","bola","bola","bola","sepak"],
        ["uang","ekonomi","uang","uang","uang","ekonomi","ekonomi"],
        ["sepak","bola","sepak","bola","sepak","sepak"],
        ["ekonomi","ekonomi","uang","uang"],
        ["sepak","uang","ekonomi"],
        ["komputer","komputer","teknologi","teknologi","komputer","teknologi"],
        ["teknologi","komputer","teknologi"]]
word_dict = wordDict(docs)
collection = toNpArray(word_dict, docs)
# number of topics
K = 3
# number of words (vocab)
V = len(word_dict)
# number of documents
D = len(collection)
# array([1, 1, 1, ..., 1]) K times
alpha = np.ones(K)
# array([1, 1, 1, ..., 1]) V times
beta = np.ones(V)
# array containing the information about doc length in our collection
Nd = [len(doc) for doc in collection]
# sampler settings; the number of kept samples is N_ITER - N_BURN
N_ITER = 5000
N_BURN = 1000
######################## LDA model ##################################
# topic distribution per-document
theta = pc.Container([pc.CompletedDirichlet("theta_%s" % i,
                                            pc.Dirichlet("ptheta_%s" % i, theta=alpha))
                      for i in range(D)])
# word distribution per-topic
phi = pc.Container([pc.CompletedDirichlet("phi_%s" % j,
                                          pc.Dirichlet("pphi_%s" % j, theta=beta))
                    for j in range(K)])
# Please note that this is the tricky part :)
# topic assignment of every word, initialised with random topics
z = pc.Container([pc.Categorical("z_%i" % d,
                                 p=theta[d],
                                 size=Nd[d],
                                 value=np.random.randint(K, size=Nd[d]))
                  for d in range(D)])
# word generated from phi, given a topic z; the Lambda node selects phi[z]
# from the current value of the topic variable
w = pc.Container([pc.Categorical("w_%i_%i" % (d, i),
                                 p=pc.Lambda("phi_z_%i_%i" % (d, i),
                                             lambda z=z[d][i], phi=phi: phi[z]),
                                 value=collection[d][i],
                                 observed=True)
                  for d in range(D) for i in range(Nd[d])])
####################################################################
model = pc.Model([theta, phi, z, w])
mcmc = pc.MCMC(model)
mcmc.sample(iter=N_ITER, burn=N_BURN)
# show the topic assignment for each word, using the last trace
for d in range(D):
    print(mcmc.trace('z_%i' % d)[N_ITER - N_BURN - 1])

Related

Substituting undefined function in symbolic expression

In an expression depending on an unknown function f of r**2, I would like to replace the function f by some actual function and display the result but I cannot find a way to do it. Here's an example:
from sympy import Function, symbols

r = symbols('r', real=True)
# r times an undefined function phi applied to r**2
phi = r * Function('phi')(r**2)
dphi = phi.diff(r)
print(dphi)
At this stage I get:
2*r**2*Subs(Derivative(phi(_xi_1), _xi_1), _xi_1, r**2) + phi(r**2)
Now let's assume I would like to evaluate dphi when phi(y) = y.
This should give me:
2*r**2 + r**2 = 3*r**2
How do I make the actual substitution of phi in dphi to obtain the desired result ?
@Davide_sd This is an example that works as I expect (but for a function of r alone):
from sympy import Function, sin, symbols

r = symbols('r', real=True)
# Here the name phi is bound to the applied function phi(r), not to the
# function symbol itself.
phi = Function('phi')(r)
om = r * phi
dom = om.diff(r)
dom.subs(phi, sin(r)).doit()
output: r * cos(r) +sin(r)
But I would like to have for example (does not work):
from sympy import Function, sin, symbols

r = symbols('r', real=True)
# Here the name phi is bound to the expression phi(r**2), not to the
# function symbol itself.
phi = Function('phi')(r**2)
om = r * phi
dom = om.diff(r)
dom.subs(phi, sin(r)).doit()
output: 2*r**2*Subs(Derivative(phi(_xi_1), _xi_1), _xi_1, r**2) + phi(r**2)
Instead I would like to get:
2*r**2*cos(r**2) + sin(r**2)
Thanks in advance for any help,
Regards,
Bernard.
If you want phi(y) = y this is the Id function; replacement can be done as:
>>> from sympy import Id
>>> dphi.subs(Function('phi'), Id).doit() # where dphi is as given at the start
3*r**2
In your example for dom it is not clear what function mapping you want since you use phi for the expression Function('phi')(r**2) and for the function Function('phi'). If you mean the same as in the first example -- something like phi(r) = sin(r) then you would just replace Function('phi') with sin.
Another way to approach this is by using replace to find occurrences of the function of interest and replace them with desired value, regardless of argument.
>>> p = Function('phi')
>>> (1+p(x)).replace(lambda x:x.func==p, lambda x:x.args[0]**3)
x**3 + 1
Based on the above answer and another usage of replace that I discovered here (Why does substitution into a sympy derivative only partly work), here is a summary of two solutions that do exactly what I need:
>>> r = symbols('r', real= True)
>>> phi = Function('phi')
>>> om = r * phi(r**2)
>>> dom = om.diff(r)
# Solution 1
>>> dom.replace(lambda x: x.func == phi, lambda x: x.args[0]**3).doit()
7*r**6
# Solution 2
>>> y = Wild("y")
>>> dom.replace(phi(y), y**3).doit()
7*r**6

What does Lambda do in this code (python keras)?

def AdaIN(x):
    """Adaptive instance normalization, written for use in a Keras Lambda layer.

    Args:
        x: sequence of three tensors: x[0] is the image representation,
           x[1] the scale (gamma) and x[2] the bias (beta).
           # assumes x[0] is channels-last (batch, height, width, channels),
           # since statistics are taken over axes 1 and 2 - TODO confirm

    Returns:
        The normalized x[0], multiplied by the scale and shifted by the bias.
    """
    # Normalize x[0] (image representation)
    mean = K.mean(x[0], axis = [1, 2], keepdims = True)
    # The small constant keeps the division below away from zero.
    std = K.std(x[0], axis = [1, 2], keepdims = True) + 1e-7
    y = (x[0] - mean) / std
    # Reshape scale and bias parameters so they broadcast against y
    pool_shape = [-1, 1, 1, y.shape[-1]]
    scale = K.reshape(x[1], pool_shape)
    bias = K.reshape(x[2], pool_shape)
    # Multiply by x[1] (GAMMA) and add x[2] (BETA)
    return y * scale + bias
def g_block(input_tensor, latent_vector, filters):
    """Build one generator block: upsample, convolve, apply AdaIN, then ReLU.

    Args:
        input_tensor: feature-map tensor fed to the block.
        latent_vector: tensor from which the AdaIN scale and bias are computed.
        filters: number of convolution filters; also the size of the scale
            and bias vectors.

    Returns:
        The output tensor of the block.
    """
    # Two dense layers map the latent vector to per-filter scale and bias;
    # the scale layer's bias starts at one.
    gamma = Dense(filters, bias_initializer = 'ones')(latent_vector)
    beta = Dense(filters)(latent_vector)
    out = UpSampling2D()(input_tensor)
    out = Conv2D(filters, 3, padding = 'same')(out)
    # Lambda passes the list of three tensors to AdaIN as its single argument x.
    out = Lambda(AdaIN)([out, gamma, beta])
    out = Activation('relu')(out)
    return out
Please see the code above. I am currently studying StyleGAN. I am trying to convert this code to PyTorch, but I can't seem to understand what Lambda does in g_block. AdaIN takes only one input according to its declaration, but somehow gamma and beta are also used as inputs? Please tell me what Lambda does in this code.
Thank you very much.
Lambda layers in keras are used to call custom functions inside the model. In g_block Lambda calls AdaIN function and passes out, gamma, beta as arguments inside a list. And AdaIN function receives these 3 tensors encapsulated within a single list as x. And also those tensors are accessed inside AdaIN function by indexing list x(x[0], x[1], x[2]).
Here's pytorch equivalent:
import torch
import torch.nn as nn
import torch.nn.functional as F
class AdaIN(nn.Module):
    """Adaptive instance normalization for (batch, channels, H, W) feature maps."""

    def forward(self, out, gamma, beta):
        """Normalize *out* per sample and channel, then scale and shift it.

        Args:
            out: tensor of shape (batch, channels, H, W).
            gamma: per-channel scale of shape (batch, channels).
            beta: per-channel bias of shape (batch, channels).

        Returns:
            Tensor with the same shape as *out*.
        """
        bs, ch = out.size()[:2]
        flat = out.reshape(bs, ch, -1)
        mean = flat.mean(dim=2).reshape(bs, ch, 1, 1)
        # Population (biased) standard deviation: this matches Keras' K.std,
        # and stays finite for a 1x1 feature map, where the unbiased
        # estimator divides by zero.
        std = flat.std(dim=2, unbiased=False).reshape(bs, ch, 1, 1) + 1e-7
        y = (out - mean) / std
        # Add two trailing singleton axes so the vectors line up with `out`.
        bias = beta.unsqueeze(-1).unsqueeze(-1).expand_as(out)
        scale = gamma.unsqueeze(-1).unsqueeze(-1).expand_as(out)
        return y * scale + bias
class g_block(nn.Module):
    """Generator block: upsample x2, 3x3 convolution, AdaIN, then ReLU."""

    def __init__(self, filters, latent_vector_shape, input_tensor_channels):
        """Create the layers of the block.

        Args:
            filters: number of output channels of the convolution; also the
                size of the scale and bias vectors.
            latent_vector_shape: number of features of the latent vector.
            input_tensor_channels: number of channels of the input tensor.
        """
        super().__init__()
        self.gamma = nn.Linear(in_features = latent_vector_shape, out_features = filters)
        # Initializes all bias to 1
        self.gamma.bias.data = torch.ones(filters)
        self.beta = nn.Linear(in_features = latent_vector_shape, out_features = filters)
        # 3x3 kernel, stride 1, padding 1: the spatial size is preserved.
        self.conv = nn.Conv2d(input_tensor_channels, filters, 3, 1, padding=1)
        self.adain = AdaIN()

    def forward(self, input_tensor, latent_vector):
        """Run the block.

        Args:
            input_tensor: tensor of shape (batch, input_tensor_channels, H, W).
            latent_vector: tensor of shape (batch, latent_vector_shape).

        Returns:
            Tensor of shape (batch, filters, 2*H, 2*W).
        """
        gamma = self.gamma(latent_vector)
        beta = self.beta(latent_vector)
        # check default interpolation mode in keras and replace mode below if different
        out = F.interpolate(input_tensor, scale_factor=2, mode='nearest')
        out = self.conv(out)
        out = self.adain(out, gamma, beta)
        out = torch.relu(out)
        return out
# Sample: push one random input and one random latent vector through a block.
input_tensor = torch.randn(1, 3, 10, 10)
latent_vector = torch.randn(1, 5)
# The constructor needs the latent size and the input channel count up front.
g = g_block(
    filters=3,
    latent_vector_shape=latent_vector.shape[1],
    input_tensor_channels=input_tensor.shape[1],
)
out = g(input_tensor, latent_vector)
print(out)
Note: you need to pass latent_vector and input_tensor shapes while creating g_block.

How do i measure perplexity scores on a LDA model made with the textmineR package in R?

I've built an LDA topic model in R using the textmineR package; it looks as follows.
## get textmineR dtm
dtm2 <- CreateDtm(doc_vec = dat2$fulltext, # character vector of documents
                  ngram_window = c(1, 2),
                  doc_names = dat2$names,
                  stopword_vec = c(stopwords::stopwords("da"), custom_stopwords),
                  lower = TRUE, # lowercase - this is the default value
                  remove_punctuation = TRUE, # punctuation - this is the default
                  remove_numbers = TRUE, # numbers - this is the default
                  verbose = TRUE,
                  cpus = 4)
# keep only terms whose total count over all documents is above 2
dtm2 <- dtm2[, colSums(dtm2) > 2]
# keep only terms longer than 2 characters
dtm2 <- dtm2[, str_length(colnames(dtm2)) > 2]
############################################################
## RUN & EXAMINE TOPIC MODEL
############################################################
# Fix the random seed so the model fit is reproducible
set.seed(34838)
model2 <- FitLdaModel(dtm = dtm2,
                      k = 8,
                      iterations = 500,
                      burnin = 200,
                      alpha = 0.1,
                      beta = 0.05,
                      optimize_alpha = TRUE,
                      calc_likelihood = TRUE,
                      calc_coherence = TRUE,
                      calc_r2 = TRUE,
                      cpus = 4)
The questions are then:
1. Which function should i apply to get the perplexity scores in the textmineR package? I can't seem to find one.
2. How do I measure perplexity scores for different numbers of topics (k)?
As asked: there's no way to calculate perplexity with textmineR unless you explicitly program it yourself. TBH, I've never seen value in perplexity that you couldn't get from likelihood and coherence, so I didn't implement it.
However, the text2vec package does have an implementation. See below for example:
# Compute the perplexity of a fitted topic model with text2vec.
library(textmineR)
# model ships with textmineR as example
m <- nih_sample_topic_model
# dtm ships with textmineR as example
d <- nih_sample_dtm
# get perplexity
# text2vec is called through its namespace (::), so it is not attached here;
# the two distributions are read from the model's phi and theta elements
p <- text2vec::perplexity(X = d,
topic_word_distribution = m$phi,
doc_topic_distribution = m$theta)

Additional seedwords argument in LDA() function from topicmodels

I am looking for an in depth example of Latent Dirichlet Allocation (LDA) with seedwords specified for the topicmodels package in R.
The basic function takes on the form:
LDA(x, k, method = "Gibbs", control = NULL, model = NULL, ...)
And the documentation only states:
For method = "Gibbs" an additional argument seedwords can be specified
as a matrix or an object of class "simple_triplet_matrix"; the default
is NULL.
Can anyone point me to a complete example of how this would look and function?
Taken from this answer:
https://stats.stackexchange.com/questions/384183/seeded-lda-using-topicmodels-in-r
# Seeded LDA with topicmodels: fit a Gibbs-sampled model with a seedwords matrix.
library("topicmodels")
data("AssociatedPress", package = "topicmodels")
## We fit 6 topics.
## We specify five seed words for five topics, the sixth topic has no
## seed words.
library("slam")
set.seed(123)
# row indices: topics 1..5, each repeated five times
i <- rep(1:5, each = 5)
# column indices: 25 randomly drawn term columns of the document-term matrix
j <- sample(1:ncol(AssociatedPress), 25)
SeedWeight <- 500 - 0.1
# sparse 6 x (number of terms) matrix holding SeedWeight at every (i, j) pair;
# row 6 receives no entries
deltaS <- simple_triplet_matrix(i, j, v = rep(SeedWeight, 25),
nrow = 6, ncol = ncol(AssociatedPress))
set.seed(1000)
ldaS <- LDA(AssociatedPress, k = 6, method = "Gibbs", seedwords = deltaS,
control = list(alpha = 0.1, best = TRUE,
verbose = 500, burnin = 500, iter = 100, thin = 100, prefix = character()))
# for each row of deltaS, the column positions that carry the seed weight
apply(deltaS, 1, function(x) which(x == SeedWeight))
# for each row of the posterior term matrix, the positions of its 5 largest values
apply(posterior(ldaS)$terms, 1, function(x) order(x, decreasing = TRUE)[1:5])

MXNET CNN+LSTM save/serialize to json

I'm having a hard time figuring out how to correctly define an MXNet network so that I can serialize/convert the model to a JSON file.
The pipeline is composed of a CNN + biLSTM + CTC.
I know I must use HybridBlock and hybridize(), but I can't seem to make it work, and I don't know whether it is even possible or whether there is another way around it.
I'm sure it's a lack of knowledge on my part, and I wonder if anyone can help.
Here is the net definition in python:
# Hyper-parameters used by the network definition below.
NUM_HIDDEN = 200  # first positional argument of the LSTM in EncoderLayer
NUM_CLASSES = 13550  # not referenced by the code shown in this snippet
NUM_LSTM_LAYER = 1  # second positional argument of the LSTM in EncoderLayer
p_dropout = 0.5  # passed to gluon.nn.Dropout in get_encoder
SEQ_LEN = 32  # number of pieces the flattened features are split into
def get_featurizer():
    """Build and hybridize the convolutional feature extractor.

    Returns:
        A hybridized gluon.nn.HybridSequential holding the layers added below.
    """
    featurizer = gluon.nn.HybridSequential()
    # conv layer
    featurizer.add(gluon.nn.Conv2D(kernel_size=(3,3), padding=(1,1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    # ... (further layers were elided in the original post)
    featurizer.hybridize()
    return featurizer
class EncoderLayer(gluon.Block):
    """Bidirectional LSTM encoder applied to the featurizer output."""

    def __init__(self, **kwargs):
        """Create the bidirectional LSTM; keyword arguments go to gluon.Block."""
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def forward(self, x):
        """Rearrange x into a sequence, run the LSTM and return batch-major output."""
        x = x.transpose((0,3,1,2))
        x = x.flatten()
        # split returns a list of arrays, which are stacked back along a new
        # leading axis on the next line
        x = x.split(num_outputs=SEQ_LEN, axis = 1) # (SEQ_LEN, N, CHANNELS)
        x = nd.concat(*[elem.expand_dims(axis=0) for elem in x], dim=0)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2)) # (N, SEQ_LEN, HIDDEN_UNITS)
        return x
def get_encoder():
    """Build the encoder: an EncoderLayer followed by dropout.

    Returns:
        A gluon.nn.Sequential containing the two layers.
    """
    encoder = gluon.nn.Sequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder
def get_decoder():
    """Build and hybridize the dense output layer.

    ALPHABET_SIZE is not defined in this snippet and must be provided elsewhere.

    Returns:
        A hybridized Dense layer with ALPHABET_SIZE units and flatten=False.
    """
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    decoder.hybridize()
    return decoder
def get_net():
    """Assemble the full network: featurizer, encoder and decoder in sequence.

    Returns:
        A gluon.nn.Sequential containing the three sub-networks.
    """
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net
Any help would be highly appreciated.
Thank you very much.
There are few requirements for a model in Gluon to be exportable to json:
It needs to be hybridizable, meaning that each children block should be hybridizable as well and the model works in both modes
All parameters should be initialized. Since Gluon uses deferred parameter initialization, that means that you should do forward pass at least once before you can save the model.
I did some fixes for your code also introducing new constants when I needed. The most significant changes are:
Don't use split if you can avoid it, because it returns a list of NDArrays. Use reshape, which works seamlessly with Symbol as well.
Starting from 1.3.0 version of MXNet, LSTM is also hybridizable, so you can wrap it in a HybridBlock instead of just a Block.
Use HybridSequential.
Here is the adjusted code with an example at the bottom how to save the model and how to load it back. You can find more information in this tutorial.
import mxnet as mx
from mxnet import gluon
from mxnet import nd
# Constants used by the network definition and by the example run below.
BATCH_SIZE = 1  # first dimension of the fake input
CHANNELS = 100  # last dimension of the fake input; also used in EncoderLayer's reshape
ALPHABET_SIZE = 1000  # number of units of the Dense decoder
NUM_HIDDEN = 200  # first positional argument of the LSTM in EncoderLayer
NUM_CLASSES = 13550  # not referenced by the code shown in this snippet
NUM_LSTM_LAYER = 1  # second positional argument of the LSTM in EncoderLayer
p_dropout = 0.5  # passed to gluon.nn.Dropout in get_encoder
SEQ_LEN = 32  # leading dimension of the reshape in EncoderLayer
HEIGHT = 100  # second dimension of the fake input
WIDTH = 100  # third dimension of the fake input
def get_featurizer():
    """Build the convolutional feature extractor.

    Returns:
        A gluon.nn.HybridSequential with one 3x3 convolution (32 channels,
        ReLU activation) followed by batch normalization.
    """
    featurizer = gluon.nn.HybridSequential()
    featurizer.add(
        gluon.nn.Conv2D(kernel_size=(3, 3), padding=(1, 1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    return featurizer
class EncoderLayer(gluon.HybridBlock):
    """Hybridizable bidirectional LSTM encoder applied to the featurizer output."""

    def __init__(self, **kwargs):
        """Create the bidirectional LSTM; keyword arguments go to gluon.HybridBlock."""
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def hybrid_forward(self, F, x):
        """Rearrange x into a sequence, run the LSTM and return batch-major output.

        Args:
            F: module handed in by Gluon; not used directly here because only
                methods of x are called.
            x: output of the featurizer.
        """
        x = x.transpose((0, 3, 1, 2))
        x = x.flatten()
        # A single reshape takes the place of split + concat.
        x = x.reshape(shape=(SEQ_LEN, -1, CHANNELS))  # (SEQ_LEN, N, CHANNELS)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2))  # (N, SEQ_LEN, HIDDEN_UNITS)
        return x
def get_encoder():
    """Build the encoder: an EncoderLayer followed by dropout.

    Returns:
        A gluon.nn.HybridSequential containing the two layers.
    """
    encoder = gluon.nn.HybridSequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder
def get_decoder():
    """Build the dense output layer.

    Returns:
        A Dense layer with ALPHABET_SIZE units and flatten=False.
    """
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    return decoder
def get_net():
    """Assemble the full network: featurizer, encoder and decoder in sequence.

    Returns:
        A gluon.nn.HybridSequential containing the three sub-networks.
    """
    net = gluon.nn.HybridSequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net
if __name__ == '__main__':
    net = get_net()
    net.initialize()
    net.hybridize()
    # Run one forward pass before exporting the model.
    fake_data = mx.random.uniform(shape=(BATCH_SIZE, HEIGHT, WIDTH, CHANNELS))
    out = net(fake_data)
    net.export("mymodel")
    # Load the exported symbol file and parameters back into a SymbolBlock.
    deserialized_net = gluon.nn.SymbolBlock.imports(
        "mymodel-symbol.json", ['data'], "mymodel-0000.params", ctx=mx.cpu())
    out2 = deserialized_net(fake_data)
    # just to check that we get the same results; absolute differences are
    # summed so that positive and negative deviations cannot cancel out
    assert (out - out2).abs().sum().asscalar() == 0