Predict sentiment score using multiclass logistic regression with R

I am trying to create a sentiment analysis classifier using multinomial logistic regression with R (glmnet). Here is the R code:
library(tidyverse)
library(text2vec)
library(caret)
library(glmnet)
library(ggrepel)
Train_classifier <- read.csv('IRC.csv', header = TRUE, sep = ";")
Test_classifier <- read.csv('IRC2.csv', header = TRUE, sep = ";")
# keep only 4 columns of each data frame
Train <- Train_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]
Test <- Test_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]
# drop rows whose comment is empty (trimws() reduces blanks to "", not " ")
subTrain <- filter(Train, trimws(Raison.Reco) != "")
subTrain$ID <- seq.int(nrow(subTrain))
# recode the 0-10 score into 4 classes: 0-4 -> 0, 5-6 -> 1, 7-8 -> 2, 9-10 -> 3
subTrain$Note.Reco <- ifelse(subTrain$Note.Reco >= 0 & subTrain$Note.Reco <= 4, 0,
                      ifelse(subTrain$Note.Reco >= 5 & subTrain$Note.Reco <= 6, 1,
                      ifelse(subTrain$Note.Reco >= 7 & subTrain$Note.Reco <= 8, 2, 3)))
subTest <- filter(Test, trimws(Raison.Reco) != "")
subTest$ID <- seq.int(nrow(subTest))
# Data pre-processing: lower-case and tokenize the free-text column
# (convert only the text column to character; Note.Reco must stay numeric
# so it can serve as the response below)
prep_fun <- tolower
tok_fun <- word_tokenizer
subTrain$Raison.Reco <- as.character(subTrain$Raison.Reco)
it_train <- itoken(subTrain$Raison.Reco,
                   preprocessor = prep_fun,
                   tokenizer = tok_fun,
                   ids = subTrain$ID,
                   progressbar = TRUE)
subTest$Raison.Reco <- as.character(subTest$Raison.Reco)
it_test <- itoken(subTest$Raison.Reco,
                  preprocessor = prep_fun,
                  tokenizer = tok_fun,
                  ids = subTest$ID,
                  progressbar = TRUE)
# creation of vocabulary and document-term matrices
### training data
vocab_train <- create_vocabulary(it_train)
vectorizer_train <- vocab_vectorizer(vocab_train)
dtm_train <- create_dtm(it_train, vectorizer_train)
### test data: reuse the TRAIN vectorizer so the columns line up
### (a separate test vocabulary would produce a matrix whose columns
### do not match the model's coefficients)
dtm_test <- create_dtm(it_test, vectorizer_train)
## define the tf-idf model
tfidf <- TfIdf$new()
# fit tf-idf on the training data, then apply the SAME fitted weights to the test data
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
dtm_test_tfidf <- transform(dtm_test, tfidf)
glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf,
                               y = factor(subTrain[['Note.Reco']]),
                               family = 'multinomial',
                               # L1 penalty (lasso)
                               alpha = 1,
                               # "auc" is only defined for two-class problems;
                               # use misclassification error for the multinomial case
                               type.measure = "class",
                               # 5-fold cross-validation
                               nfolds = 5,
                               # a larger threshold is less accurate but trains faster
                               thresh = 1e-3,
                               # likewise, fewer iterations for faster training
                               maxit = 1e3)
plot(glmnet_classifier)
Here is the structure of the data subTrain (from a screenshot in the original post):
Note.Reco  Raison.Reco   DATE_SAISIE  idpart
3          Good service  19/03/2014   56992
2          good stuff    19/03/2014   53645
8          very nice     20/02/2016   261392
...
I get the plot in the attached file. Can you explain whether this approach is correct? Thank you.
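A quick way to sanity-check the fitted model is to predict classes for the held-out test set and tabulate them against the recoded test labels. This is a minimal sketch, assuming the corrected pipeline above (the test DTM built with the training vectorizer, and the test scores recoded with the same 4-class scheme as the training data):
# recode the test scores with the same scheme used for training
subTest$Note.Reco <- ifelse(subTest$Note.Reco >= 0 & subTest$Note.Reco <= 4, 0,
                     ifelse(subTest$Note.Reco >= 5 & subTest$Note.Reco <= 6, 1,
                     ifelse(subTest$Note.Reco >= 7 & subTest$Note.Reco <= 8, 2, 3)))
# class predictions at the lambda minimising the cross-validated error
pred <- predict(glmnet_classifier, newx = dtm_test_tfidf,
                s = "lambda.min", type = "class")
# confusion matrix of predicted vs. observed classes
caret::confusionMatrix(factor(pred, levels = 0:3),
                       factor(subTest$Note.Reco, levels = 0:3))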

Related

R - specifying interaction contrasts for aov

How do I specify the contrasts (point estimates, 95% CIs, and p-values) for the between-group differences of the within-group delta changes?
In the example below, I am interested in the between-group difference (group = 1 minus group = 2) of the delta changes (time = 3 minus time = 1).
df and model:
demo3 <- read.csv("https://stats.idre.ucla.edu/stat/data/demo3.csv")
## Convert variables to factor
demo3 <- within(demo3, {
  group <- factor(group)
  time <- factor(time)
  id <- factor(id)
})
par(cex = .6)
demo3.aov <- aov(pulse ~ group * time + Error(id), data = demo3)
summary(demo3.aov)
Neither of these chunks of code achieves my goal, correct?
library(emmeans)
m2 <- emmeans(demo3.aov, "group", by = "time")
pairs(m2)
m22 <- emmeans(demo3.aov, c("group", "time"))
pairs(m22)
Look at the documentation for emmeans::contrast and in particular the argument interaction. If I understand your question correctly, you might want
summary(contrast(m22, interaction = c("pairwise", "dunnett")),
        infer = c(TRUE, TRUE))
which would compute Dunnett-style contrasts for time (each time vs. time1) and then compare those contrasts between groups (group1 - group2). The summary(..., infer = c(TRUE, TRUE)) part overrides the default whereby tests, but not confidence intervals, are shown.
You could also do this in stages:
time.con <- contrast(m22, "dunnett", by = "group", name = "timediff")
summary(pairs(time.con, by = NULL), infer = c(TRUE, TRUE))
If you truly want just time 3 - time 1, then replace time.con with
time.con1 <- contrast(m22, list(`time3-time1` = c(-1, 0, 1, 0, 0)))
(I don't know how many time points you have; I assumed 5 in the above.)
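Since that custom contrast vector needs exactly one entry per level of time, it is worth checking the factor before using it; a quick check with base R only:
nlevels(demo3$time)  # the contrast vector must have exactly this many entries
levels(demo3$time)   # and its entries follow this ordering of the levels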

Difference in Computation Speed and Results Between MLR and MLR3

I don't get similar results when I use the same data and models with mlr and mlr3, and I also find that mlr runs at least 20 times faster. Since I can't share my own data, I reproduced the difference in computation speed and results with the lung data from the survival package.
mlr completed in 1 min with a generally low C-index, whereas mlr3 took 21 min and produced a much higher C-index, despite the same data, the same preprocessing, the same models, and a set seed.
library(tidyverse)
library(tidymodels)
library(PKPDmisc)
library(mlr)
library(parallelMap)
library(survival)
# Data and data splitting
data = as_tibble(lung) %>%
  mutate(status = if_else(status == 1, 0, 1),
         sex = factor(sex, levels = c(1:2), labels = c("male", "female")),
         ph.ecog = factor(ph.ecog))
# introduce ~10% missingness in sex and ph.ecog
na <- sample(1:228, round(228 * 0.1))
data$sex[na] <- NA
data$ph.ecog[na] <- NA
set.seed(123)
split <- data %>% initial_split(prop = 0.8, strata = status)
train <- split %>% training()
test <- split %>% testing()
# Task
task = makeSurvTask(id = "Survival", data = train, target = c("time", "status"))
# Resampling
# for model assessment before external validation on test data
set.seed(123)
outer_cv = makeResampleDesc("CV", iters = 4, stratify.cols = c("status")) %>%
  makeResampleInstance(task)
# for feature selection and parameter tuning
set.seed(123)
inner_cv = makeResampleDesc("CV", iters = 4, stratify.cols = c("status"))
# Learners
cox1 = makeLearner(id = "COX1", "surv.coxph") %>%
  makeImputeWrapper(classes = list(factor = imputeMode(), numeric = imputeMedian()),
                    # create dummy variables for factor features
                    dummy.classes = "factor") %>%
  # normalize numeric features
  makePreprocWrapperCaret(ppc.center = TRUE, ppc.scale = TRUE) %>%
  makeFeatSelWrapper(resampling = inner_cv, show.info = TRUE,
                     control = makeFeatSelControlSequential(method = "sfs"))
cox_lasso = makeLearner(id = "COX LASSO", "surv.glmnet") %>%
  makeImputeWrapper(classes = list(factor = imputeMode(), numeric = imputeMedian()),
                    dummy.classes = "factor") %>%
  makePreprocWrapperCaret(ppc.center = TRUE, ppc.scale = TRUE) %>%
  makeTuneWrapper(resampling = inner_cv, show.info = TRUE,
                  par.set = makeParamSet(makeNumericParam("lambda", lower = -3, upper = 0,
                                                          trafo = function(x) 10^x)),
                  control = makeTuneControlGrid(resolution = 10L))
cox_net = makeLearner(id = "COX NET", "surv.glmnet") %>%
  makeImputeWrapper(classes = list(factor = imputeMode(), numeric = imputeMedian()),
                    dummy.classes = "factor") %>%
  makePreprocWrapperCaret(ppc.center = TRUE, ppc.scale = TRUE) %>%
  makeTuneWrapper(resampling = inner_cv, show.info = TRUE,
                  par.set = makeParamSet(makeNumericParam("alpha", lower = 0, upper = 1,
                                                          trafo = function(x) round(x, 2)),
                                         makeNumericParam("lambda", lower = -3, upper = 1,
                                                          trafo = function(x) 10^x)),
                  control = makeTuneControlGrid(resolution = 10L))
# Benchmark
# parallelStartSocket(4)
start_time <- Sys.time()
set.seed(123)
mlr_bmr = benchmark(learners = list(cox1, cox_lasso, cox_net),
                    tasks = task,
                    resamplings = outer_cv,
                    keep.extract = TRUE,
                    models = TRUE)
end_time <- Sys.time()
mlr_time = end_time - start_time
# parallelStop()
mlr_res <- getBMRPerformances(mlr_bmr, as.df = TRUE) %>%
  select(Learner = learner.id, Task = task.id, Cindex = cindex) %>%
  mutate(Color_Package = "mlr",
         Learner = word(str_replace(Learner, "\\.", " "), 1, -2))
##################################################################
library(mlr3verse)
# Task
task2 = TaskSurv$new(id = "Survival2", backend = train, time = "time", event = "status")
task2$col_roles$stratum = c("status")
# Resample
set.seed(123)
outer_cv2 = rsmp("cv", folds = 4)$instantiate(task2)
# for feature selection and parameter tuning
set.seed(123)
inner_cv2 = rsmp("cv", folds = 4)
# Learners
preproc = po("imputemedian", affect_columns = selector_type("numeric")) %>>%
  po("imputemode", affect_columns = selector_type("factor")) %>>%
  po("scale") %>>%
  po("encode")
cox2 = AutoFSelector$new(learner = as_learner(preproc %>>% lrn("surv.coxph")),
                         resampling = inner_cv2,
                         measure = msr("surv.cindex"),
                         terminator = trm("none"), # need to increase later
                         fselector = fs("sequential", strategy = "sfs")) # sfs is the default
cox2$id = "COX1"
cox_lasso2 = AutoTuner$new(learner = as_learner(preproc %>>%
                                                  lrn("surv.glmnet",
                                                      lambda = to_tune(p_dbl(lower = -3, upper = 0,
                                                                             trafo = function(x) 10^x)))),
                           resampling = inner_cv2,
                           measure = msr("surv.cindex"),
                           terminator = trm("none"),
                           tuner = tnr("grid_search", resolution = 10))
cox_lasso2$id = "COX LASSO"
cox_net2 = AutoTuner$new(learner = as_learner(preproc %>>%
                                                lrn("surv.glmnet",
                                                    alpha = to_tune(p_dbl(lower = 0, upper = 1)),
                                                    lambda = to_tune(p_dbl(lower = -3, upper = 1,
                                                                           trafo = function(x) 10^x)))),
                         resampling = inner_cv2,
                         measure = msr("surv.cindex"),
                         terminator = trm("none"),
                         tuner = tnr("grid_search", resolution = 10))
cox_net2$id = "COX NET"
# Benchmark
design = benchmark_grid(tasks = task2,
                        learners = c(cox2, cox_lasso2, cox_net2),
                        resamplings = outer_cv2)
# future::plan("multisession")
# Error: Output type of PipeOp select during training (Task) incompatible with input type of PipeOp surv.coxph (TaskSurv)
start_time <- Sys.time()
set.seed(123)
mlr3_bmr = mlr3::benchmark(design)
end_time <- Sys.time()
mlr3_time = end_time - start_time
mlr3_res <- as.data.table(mlr3_bmr$score()) %>%
  select(Task = task_id, Learner = learner_id, Cindex = surv.harrell_c) %>%
  mutate(Color_Package = "mlr3")
mlr_res %>%
  bind_rows(mlr3_res) %>%
  ggplot(aes(Learner, Cindex, fill = Color_Package)) +
  geom_boxplot(position = position_dodge(.8)) +
  stat_summary(fun = mean, geom = "point", aes(group = Color_Package),
               position = position_dodge(.8), size = 3) +
  labs(x = "", y = "C-Index") +
  theme_bw() + base_theme() + theme(legend.position = "top")
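One knob worth checking before comparing wall-clock times (an aside, not an explanation of the C-index gap): mlr3 logs every inner resampling iteration by default, which adds noticeable overhead in nested resampling, and the benchmark can be parallelised with future. A minimal sketch, using the design object from above:
# silence per-iteration logging from mlr3 and from bbotk (tuning / feature selection)
lgr::get_logger("mlr3")$set_threshold("warn")
lgr::get_logger("bbotk")$set_threshold("warn")
# run the resampling iterations on parallel worker processes
future::plan("multisession")
set.seed(123)
mlr3_bmr = mlr3::benchmark(design)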

How do I add significance asterisks next to my values in a correlation matrix heat map?

I found this code online at http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization
It provides instructions for creating a correlation matrix heat map, and it works well. However, I was wondering how to get little stars (*) next to the values in the matrix that are significant. How would I go about doing that? Any help is greatly appreciated!
mydata <- mtcars[, c(1,3,4,5,6,7)]
head(mydata)
cormat <- round(cor(mydata),2)
head(cormat)
library(reshape2)
melted_cormat <- melt(cormat)
head(melted_cormat)
library(ggplot2)
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile()
# Get lower triangle of the correlation matrix
get_lower_tri <- function(cormat){
  cormat[upper.tri(cormat)] <- NA
  return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
  cormat[lower.tri(cormat)] <- NA
  return(cormat)
}
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
library(reshape2)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Heatmap
library(ggplot2)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Pearson\nCorrelation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 12, hjust = 1)) +
  coord_fixed()
reorder_cormat <- function(cormat){
  # Use correlation between variables as distance
  dd <- as.dist((1 - cormat) / 2)
  hc <- hclust(dd)
  cormat <- cormat[hc$order, hc$order]
}
# Reorder the correlation matrix
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Pearson\nCorrelation") +
  theme_minimal() + # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 12, hjust = 1)) +
  coord_fixed()
# Print the heatmap
print(ggheatmap)
ggheatmap +
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal") +
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5))
cor() doesn't compute significance levels; you may have to use rcorr() from the Hmisc package.
This is quite similar to what you want (though the graphic output is not as nice):
library(ggplot2)
library(reshape2)
library(Hmisc)
library(stats)
abbreviateSTR <- function(value, prefix){ # format string more concisely
  lst = c()
  for (item in value) {
    if (is.nan(item) || is.na(item)) { # if item is NaN return empty string
      lst <- c(lst, '')
      next
    }
    item <- round(item, 2) # round to two digits
    if (item == 0) { # if rounding results in 0 clarify
      item = '<.01'
    }
    item <- as.character(item)
    item <- sub("(^[0])+", "", item)   # remove leading 0: 0.05 -> .05
    item <- sub("(^-[0])+", "-", item) # remove leading -0: -0.05 -> -.05
    lst <- c(lst, paste(prefix, item, sep = ""))
  }
  return(lst)
}
d <- mtcars
cormatrix = rcorr(as.matrix(d), type = 'spearman')
cordata = melt(cormatrix$r)
cordata$labelr = abbreviateSTR(melt(cormatrix$r)$value, 'r')
cordata$labelP = abbreviateSTR(melt(cormatrix$P)$value, 'P')
cordata$label = paste(cordata$labelr, "\n", cordata$labelP, sep = "")
cordata$strike = ""
cordata$strike[cormatrix$P > 0.05] = "X"
txtsize <- par('din')[2] / 2
ggplot(cordata, aes(x = Var1, y = Var2, fill = value)) + geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") + ylab("") +
  geom_text(label = cordata$label, size = txtsize) +
  geom_text(label = cordata$strike, size = txtsize * 4, color = "red", alpha = 0.4)
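To get the asterisks the question actually asks for, one option is to map the rcorr() p-values onto significance symbols and paste them into the labels drawn by geom_text(). A sketch building on the question's objects (mydata, the reordered cormat, get_upper_tri(), melted_cormat, and ggheatmap) and on Hmisc::rcorr loaded above; the cutoffs are just the usual conventions:
# p-values for the question's variables, reordered to match the clustered cormat
pmat <- rcorr(as.matrix(mydata))$P[rownames(cormat), colnames(cormat)]
melted_p <- melt(get_upper_tri(pmat), na.rm = FALSE, value.name = "p")
# usual star codes: *** p < .001, ** p < .01, * p < .05
melted_p$stars <- cut(melted_p$p, breaks = c(0, .001, .01, .05, 1),
                      labels = c("***", "**", "*", ""), include.lowest = TRUE)
# attach the stars to the plotted cells; diagonal p-values are NA, so no star there
melted_cormat <- merge(melted_cormat, melted_p, by = c("Var1", "Var2"), all.x = TRUE)
melted_cormat$stars[is.na(melted_cormat$stars)] <- ""
ggheatmap +
  geom_text(data = melted_cormat,
            aes(Var2, Var1, label = paste0(value, stars)),
            color = "black", size = 3)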
If you are drawing the heatmap with seaborn in Python instead, the same idea works: correlation_p is the matrix of p-values for the correlation matrix, and ax5 is the Axes returned by sns.heatmap(). The loop below writes a marker on every cell whose p-value is below 0.1:
data = correlation_p
for y in range(data.shape[0]):
    for x in range(data.shape[1]):
        if data[y, x] < 0.1:
            ax5.text(x + 0.5, y + 0.5, '-', size=48,
                     horizontalalignment='center',
                     verticalalignment='center')

How do I specify a random slope for a specific contrast in lme4?

With the following dataset...
Subj <- rep(1:10, each = 10)
Item <- rep(1:10, times = 10)
IV1 <- rep(1:5, times = 20)
DV <- rnorm(100)
library(data.table)
data <- as.data.table(cbind(Subj, Item, IV1, DV))
data$Subj <- as.factor(data$Subj)
data$Item <- as.factor(data$Item)
data$IV1 <- as.factor(data$IV1)
library(MASS)
contrasts(data$IV1) <- contr.sdif(5)
library(lme4)
m1 <- lmer(DV ~ IV1 + (1 + IV1|Subj) + (1|Item), data = data)
Now suppose it turned out that there was only variance in the random subject slope for the contrast of IV1 level 2 vs. IV1 level 1. Is it possible to fit a random slope for only this contrast?
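One possible approach (a sketch, not a tested answer from the thread): pull the single contrast of interest out of the fixed-effects model matrix as a numeric covariate, and give only that column a random slope while keeping the full factor in the fixed part. The column name "IV12-1" below is what model.matrix() produces for the 2-1 column of contr.sdif(); check colnames(mm) on your own data:
# numeric covariate holding only the 2-vs-1 sliding-difference contrast
mm <- model.matrix(~ IV1, data = data)
data$c21 <- mm[, "IV12-1"]
# random subject slope for that single contrast; fixed effects keep all of IV1
m2 <- lmer(DV ~ IV1 + (1 + c21 | Subj) + (1 | Item), data = data)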

Loop for regression over multiple factors

I am struggling to get a loop to run several regressions and store the coefficients and intercepts. I have data similar to this:
data <- data.frame(y = rnorm(10), x1 = rnorm(10)*2, ID = c(rep(1,10), rep(2,10)), group = c(rep(3,5), rep(4,5)))
Where ID and group are factors, therefore:
data$ID <- as.factor(data$ID)
data$group <- as.factor(data$group)
So far I have tried two approaches.
First, I did the following:
for (i in unique(data$ID)){
  for (j in unique(data$group)){
    # note: fit is overwritten on every iteration, so only the last model survives
    fit <- glm(y ~ x1, data = data[data$ID == i & data$group == j, ])
  }
}
Afterwards I did the following:
myfun <- function(data) {
  step(glm(y ~ x1, data = data), trace = 0)
}
fcomb <- unique(data[, c("ID", "group")])
mod <- list()
for (i in 1:nrow(fcomb)) {
  mod <- c(mod, list(myfun(subset(data, ID == fcomb$ID[i] & group == fcomb$group[i]))))
}
In the end I would like a data set that, for each ID and group, contains the intercept and the beta for the effect of x1 on y.
With the second strategy I got something, but the betas and intercepts are identical across groups (which seemed impossible to me), and I still don't know how to store the values.
set.seed(1839)
data <- data.frame(
  y = rnorm(10),
  x1 = rnorm(10) * 2,
  ID = c(rep(1, 10), rep(2, 10)),
  group = c(rep(3, 5), rep(4, 5))
)
grid <- expand.grid(ID = unique(data$ID), group = unique(data$group))
results <- lapply(1:nrow(grid), function(x) {
  lm(y ~ x1, data[data$ID == grid[x, 1] & data$group == grid[x, 2], ])$coef
})
results <- t(do.call(cbind, results))
results <- cbind(grid, results)
results
Returns:
  ID group  (Intercept)         x1
1  1     3 -0.454072247  1.0295731
2  2     3 -0.454072247  1.0295731
3  1     4  0.007800405 -0.1832663
4  2     4  0.007800405 -0.1832663
The duplicated coefficients are not a bug in the loop: in the data.frame() call, y and x1 have length 10 while ID implies 20 rows, so R recycles y and x1 and rows 11-20 are exact copies of rows 1-10. Within each group, the ID = 1 and ID = 2 subsets therefore contain identical data and yield identical fits.
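If a tidy coefficient table is the end goal, a grouped-model approach avoids the manual grid entirely. A sketch using dplyr, broom, and tidyr (assumptions here, not packages the original code loads; it also still inherits the recycling issue noted above, so fix the data first):
library(dplyr)
library(broom)
# one fitted model per ID/group combination, coefficients collected in long form
coefs <- data %>%
  group_by(ID, group) %>%
  group_modify(~ tidy(lm(y ~ x1, data = .x))) %>%
  select(ID, group, term, estimate)
# reshape to one row per ID/group pair, with intercept and slope as columns
tidyr::pivot_wider(coefs, names_from = term, values_from = estimate)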