Loop for regression over multiple factors - regression

I am struggling to get a loop to run several regressions and store the coefficients and intercepts. I have a data similar as this:
data <- data.frame(y = rnorm(10), x1 = rnorm(10)*2, ID = c(rep(1,10), rep(2,10)), group = c(rep(3,5), rep(4,5)))
Where ID and group are factors, therefore:
data$ID <- as.factor(data$ID)
data$group <- as.factor(data$group)
So far I tried 2 approaches.
First I did the following:
for (i in unique(data$ID)){
for (j in unique(data$group)){
fit <- glm(y ~ x1, data=data[data$ID == i & data$group == j, ])
}
}
Afterwards I did the following:
myfun <- function(data) {
step(glm(y ~ x1, data = data), trace=0)
}
fcomb <- unique(data[,c("ID","group")])
mod <- list()
for(i in 1:nrow(fcomb)) {
mod <- c(mod,list(myfun(subset(data,ID==fcomb$ID[i] & group==fcomb$group[i]))))
}
In the end I would like to have a dataset in which for each ID and group I would have the intercept and the beta for the effect of x1 in y.
When I performed the second strategy I got something, but the betas and the intercepts are the same (which is totally impossible) and I still don't know how to store the values.

set.seed(1839)
data <- data.frame(
y = rnorm(10),
x1 = rnorm(10) * 2,
ID = c(rep(1, 10), rep(2, 10)),
group = c(rep(3, 5), rep(4, 5))
)
grid <- expand.grid(ID = unique(data$ID), group = unique(data$group))
results <- lapply(1:nrow(grid), function(x) {
lm(y ~ x1, data[data$ID == grid[x, 1] & data$group == grid[x, 2], ])$coef
})
results <- t(do.call(cbind, results))
results <- cbind(grid, results)
results
Returns:
ID group (Intercept) x1
1 1 3 -0.454072247 1.0295731
2 2 3 -0.454072247 1.0295731
3 1 4 0.007800405 -0.1832663
4 2 4 0.007800405 -0.1832663

Related

R - specifying interaction contrasts for aov

How to specificy the contrasts (point estimates, 95CI and p-values) for the between-group differences of the within-group delta changes?
In the example below, I would be interest in the between-groups (group = 1 minus group = 2) of delta changes (time = 3 minus time = 1).
df and model:
demo3 <- read.csv("https://stats.idre.ucla.edu/stat/data/demo3.csv")
## Convert variables to factor
demo3 <- within(demo3, {
group <- factor(group)
time <- factor(time)
id <- factor(id)
})
par(cex = .6)
demo3$time <- as.factor(demo3$time)
demo3.aov <- aov(pulse ~ group * time + Error(id), data = demo3)
summary(demo3.aov)
Neither of these chunks of code achieve my goal, correct?
m2 <- emmeans(demo3.aov, "group", by = "time")
pairs(m2)
m22 <- emmeans(demo3.aov, c("group", "time") )
pairs(m22)
Look at the documentation for emmeans::contrast and in particular the argument interaction. If I understand your question correctly, you might want
summary(contrast(m22, interaction = c("pairwise", "dunnett")),
infer = c(TRUE, TRUE))
which would compute Dunnett-style contrasts for time (each time vs. time1), and compare those for group1 - group2. The summary(..., infer = c(TRUE, TRUE)) part overrides the default that tests but not CIs are shown.
You could also do this in stanges:
time.con <- contrast(m22, "dunnett", by = "group", name = "timediff")
summary(pairs(time.con, by = NULL), infer = c(TRUE, TRUE))
If you truly want just time 3 - time 1, then replace time.con with
time.con1 <- contrast(m22, list(`time3-time1` = c(-1, 0, 1, 0, 0))
(I don't know how many times you have. I assumed 5 in the above.)

Using optim to estimate 3-parameters log-normal by MLE

I need your help. I am having problems to estimate the parameters of the following data in R using optim. I am always getting a error message of the the following type:
Warning: NaNs producedError in optim(par = c(0, 1, -2), nll, method = "L-BFGS-B", upper = c(Inf, :
L-BFGS-B needs finite values of 'fn'
Could you please take a look and tell what i am doing wrong? Thanks
The code I am using is the following:
X <- rlnorm3(n=1000, meanlog = 0, sdlog = 1, threshold = -2)
nll <- function(theta) {
N <- length(X)
m <- theta[1]
s <- theta[2]
a <- theta[3]
e_1 <- log(X - a) - m
e_2 <- log(X - a)
nll <- -0.5 * N * log(2 * pi) - N * log(s) - (t(e_1) %*% e_1) / 2 * s ^ 2 - (t(e_2) %*% e_2)
return(-nll)
}
optim(par=c(0, 1, -2), nll, method="L-BFGS-B", upper = c(Inf, Inf, min(X)))

How do I specify a random slope for a specific contrast in lme4?

With the following dataset...
Subj <- rep(1:10, each = 10)
Item <- rep(1:10, times = 10)
IV1 <- rep(1:5, times = 20)
DV <- rnorm(100)
library(data.table)
data <- as.data.table(cbind(Subj, Item, IV1, DV))
data$Subj <- as.factor(data$Subj)
data$Item <- as.factor(data$Item)
data$IV1 <- as.factor(data$IV1)
library(MASS)
contrasts(data$IV1) <- contr.sdif(5)
library(lme4)
m1 <- lmer(DV ~ IV1 + (1 + IV1|Subj) + (1|Item), data = data)
Now suppose that it turned out that there was only variance in the random subject slope for the contrast of IV1 level 2 vs. IV1 level1. Is it possible to fit a random slope only for this contrast?

R dataframe with list of dictionaries as field

I have a data frame with a column called identifiers which contains product identifiers data as a string which is a list of dictionaries.
test_data <- data.frame(
identifiers = c(
"[{\"type\":\"ISBN\",\"value\":\"9781231027073\"}]",
"[{\"type\":\"EAN\",\"value\":\"5055266202847\"},{\"type\":\"EAN\",\"value\":\"4053162095984\"}]"),
id = c(1,2), stringsAsFactors = FALSE)
> test_data
identifiers id
1 [{"type":"ISBN","value":"9781231027073"}] 1
2 [{"type":"EAN","value":"5055266202847"},{"type":"EAN","value":"4053162095984"}] 2
What I would like to achieve is:
output_test_data <- data.frame(
type = c("ISBN", "EAN", "EAN"),
value = c("9781231027073","5055266202847","4053162095984"),
id = c(1,2,2), stringsAsFactors = FALSE)
> output_test_data
type value id
1 ISBN 9781231027073 1
2 EAN 5055266202847 2
3 EAN 4053162095984 2
The closest I got to the solution is to apply the fomJSON function from jsonlite.
jsonlite::fromJSON(test_data$identifiers[1])
or with a loop like this:
for (i in test_data$identifiers) {
print(jsonlite::fromJSON(i))
}
However I am struggling to:
1) get it applied to all rows.
2) preserve the information about id, from original data into the results.
Could anyone help with this?
You could do this:
df_result <- apply(test_data,1,function(x){
id_tmp <- x[2]
df_out <- jsonlite::fromJSON(x[1])
df_out$id <- id_tmp
return(df_out)
})
df_result <- do.call("rbind",df_result)

Predict sentiment score using multiclass logistic regression with R

I am trying to create a sentiment analysis classifier using logistic regression with R (glmnet).. Here is the R code :
library(tidyverse)
library(text2vec)
library(caret)
library(glmnet)
library(ggrepel)
Train_classifier <- read.csv('IRC.csv',header=T, sep=";")
Test_classifier <- read.csv('IRC2.csv',header=T, sep=";")
# select only 4 column of the dataframe
Train <- Train_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]
Test <- Test_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]
#delete rows with empty value columns
subTrain <- filter(Train, trimws(Raison.Reco)!=" ")
subTrain$ID <- seq.int(nrow(subTrain))
# # replacing class values
subTrain$Note.Reco = ifelse(subTrain$Note.Reco >= 0 & subTrain$Note.Reco <= 4, 0, ifelse(subTrain$Note.Reco >= 5 &
subTrain$Note.Reco <= 6, 1, ifelse(subTrain$Note.Reco >= 7 & subTrain$Note.Reco <= 8, 2, 3)))
subTest <- filter(Test, trimws(Raison.Reco)!=" ")
subTest$ID <- seq.int(nrow(subTest))
#Data pre processing
#Doc2Vec
prep_fun <- tolower
tok_fun <- word_tokenizer
subTrain[] <- lapply(subTrain, as.character)
it_train <- itoken(subTrain$Raison.Reco,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = subTrain$ID,
progressbar = TRUE)
subTest[] <- lapply(subTest, as.character)
it_test <- itoken(subTest$Raison.Reco,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = subTest$ID,
progressbar = TRUE)
#creation of vocabulairy and term document matrix
### fichier d'apprentissage
vocab_train <- create_vocabulary(it_train)
vectorizer_train <- vocab_vectorizer(vocab_train)
dtm_train <- create_dtm(it_train, vectorizer)
### test data
vocab_test <- create_vocabulary(it_test)
vectorizer_test <- vocab_vectorizer(vocab_test)
dtm_test <- create_dtm(it_test, vectorizer_test)
##Define tf-idf model
tfidf <- TfIdf$new()
# fit the model to the train data and transform it with the fitted model
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
dtm_test_tfidf <- fit_transform(dtm_test, tfidf)
glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf,
y = subTrain[['Note.Reco']],
family = 'multinomial',
# L1 penalty
alpha = 1,
# interested in the area under ROC curve
type.measure = "auc",
# 5-fold cross-validation
nfolds = 5,
# high value is less accurate, but has faster training
thresh = 1e-3,
# again lower number of iterations for faster training
maxit = 1e3)
plot(glmnet_classifier)
Here is the struct of the data subTrain :
[![Note.Reco Raison.Reco DATE_SAISIE idpart ID
3 Good service 19/03/2014 56992
2 good stuff 19/03/2014 53645
8 very nice 20/02/2016 261392
...][1]][1]
I get this plot (attached file) Can you explain me more if it is true Thank you