Difference in Computation Speed and Results Between MLR and MLR3 - mlr

I don't get similar results when I fit the same models to the same data with mlr and mlr3. mlr also runs at least 20-fold faster. Since I can't share my data, I reproduced the difference in computation speed and results with the lung data from the survival package.
With mlr the benchmark completed in 1 min with a generally low C-index, whereas mlr3 took 21 min and gave a much higher C-index, despite using the same data, the same preprocessing, the same models and the same seed.
library(tidyverse)
library(tidymodels)
library(PKPDmisc)
library(mlr)
library(parallelMap)
library(survival)
# Data and data splitting ----
# Recode `status` so that 1 becomes 0 and every other value becomes 1, and
# turn `sex` and `ph.ecog` into factors.
data <- as_tibble(lung) %>%
  mutate(
    status = if_else(status == 1, 0, 1),
    sex = factor(sex, levels = c(1:2), labels = c("male", "female")),
    ph.ecog = factor(ph.ecog)
  )
# Blank out 10% of the rows in both factor columns so the imputation steps
# have something to do. The RNG is seeded first: without a seed the set of
# missing rows differs on every run, so two runs never see the same data.
set.seed(123)
na <- sample(seq_len(nrow(data)), floor(nrow(data) * 0.1))
data$sex[na] <- NA
data$ph.ecog[na] <- NA
# 80/20 train/test split, stratified on the event indicator.
set.seed(123)
split <- data %>% initial_split(prop = 0.8, strata = status)
train <- split %>% training()
test <- split %>% testing()
# Task ----
# Survival task on the training split; `time` and `status` form the target.
task <- makeSurvTask(id = "Survival", data = train, target = c("time", "status"))
# Resampling ----
# For model assessment before external validation on test data.
# `iters` is spelled out in full; the abbreviated `iter` only worked through
# R's partial argument matching.
set.seed(123)
outer_cv <- makeResampleDesc("CV", iters = 4L, stratify.cols = c("status")) %>%
  makeResampleInstance(task)
# For feature selection and parameter tuning (a description only; it is
# instantiated by the wrappers that receive it).
set.seed(123)
inner_cv <- makeResampleDesc("CV", iters = 4L, stratify.cols = c("status"))
# Learners ----
# All three learners share the same wrapper chain: impute (mode for factors,
# median for numerics), centre/scale via caret, then either feature selection
# or grid tuning on `inner_cv`.
# NOTE(review): `dummy.classes = "factor"` was described by the author as
# creating dummy variables for factor features; the mlr documentation
# describes these dummy columns differently - confirm the intent.

# Cox PH with sequential feature selection (method "sfs").
cox1 <- makeLearner(id = "COX1", "surv.coxph") %>%
  makeImputeWrapper(
    classes = list(factor = imputeMode(), numeric = imputeMedian()),
    dummy.classes = "factor"
  ) %>%
  makePreprocWrapperCaret(ppc.center = TRUE, ppc.scale = TRUE) %>%
  makeFeatSelWrapper(
    resampling = inner_cv,
    show.info = TRUE,
    control = makeFeatSelControlSequential(method = "sfs")
  )

# glmnet Cox model; lambda is tuned on a log10 grid from 10^-3 to 10^0.
cox_lasso <- makeLearner(id = "COX LASSO", "surv.glmnet") %>%
  makeImputeWrapper(
    classes = list(factor = imputeMode(), numeric = imputeMedian()),
    dummy.classes = "factor"
  ) %>%
  # Normalize numeric features
  makePreprocWrapperCaret(ppc.center = TRUE, ppc.scale = TRUE) %>%
  makeTuneWrapper(
    resampling = inner_cv,
    show.info = TRUE,
    par.set = makeParamSet(
      makeNumericParam("lambda", lower = -3, upper = 0, trafo = function(x) 10^x)
    ),
    control = makeTuneControlGrid(resolution = 10L)
  )

# glmnet Cox model; alpha (rounded to 2 digits) and lambda (log10 grid from
# 10^-3 to 10^1) are tuned jointly.
cox_net <- makeLearner(id = "COX NET", "surv.glmnet") %>%
  makeImputeWrapper(
    classes = list(factor = imputeMode(), numeric = imputeMedian()),
    dummy.classes = "factor"
  ) %>%
  # Normalize numeric features
  makePreprocWrapperCaret(ppc.center = TRUE, ppc.scale = TRUE) %>%
  makeTuneWrapper(
    resampling = inner_cv,
    show.info = TRUE,
    par.set = makeParamSet(
      makeNumericParam("alpha", lower = 0, upper = 1, trafo = function(x) round(x, 2)),
      makeNumericParam("lambda", lower = -3, upper = 1, trafo = function(x) 10^x)
    ),
    control = makeTuneControlGrid(resolution = 10L)
  )
# Benchmark ----
# Run the three wrapped learners on the outer CV instance and time the run.
# parallelStartSocket(4)
start_time <- Sys.time()
set.seed(123)
mlr_bmr <- benchmark(
  learners = list(cox1, cox_lasso, cox_net),
  tasks = task,
  resamplings = outer_cv,
  keep.extract = TRUE,
  models = TRUE
)
end_time <- Sys.time()
mlr_time <- end_time - start_time
# parallelStop()

# Per-fold performances as a data frame. The learner id has its first "."
# replaced by a space and its last word dropped, which strips the wrapper
# suffixes from the id.
mlr_res <- getBMRPerformances(mlr_bmr, as.df = TRUE) %>%
  select(Learner = learner.id, Task = task.id, Cindex = cindex) %>%
  mutate(
    Color_Package = "mlr",
    Learner = word(str_replace(Learner, "\\.", " "), 1, -2)
  )
##################################################################
library(mlr3verse)
# Task ----
# Survival task on the same training split, stratified on `status`.
task2 <- TaskSurv$new(
  id = "Survival2",
  backend = train,
  time = "time",
  event = "status"
)
task2$col_roles$stratum <- c("status")
# Resample ----
# Outer 4-fold CV, instantiated on the task right away.
set.seed(123)
outer_cv2 <- rsmp("cv", folds = 4)$instantiate(task2)
# For feature selection and parameter tuning
set.seed(123)
inner_cv2 <- rsmp("cv", folds = 4)
# Learners ----
# Shared preprocessing graph: median-impute numeric columns, mode-impute
# factor columns, then scale and encode.
preproc <- po("imputemedian", affect_columns = selector_type("numeric")) %>>%
  po("imputemode", affect_columns = selector_type("factor")) %>>%
  po("scale") %>>%
  po("encode")

# Cox PH with sequential feature selection on the inner CV.
cox2 <- AutoFSelector$new(
  learner = as_learner(preproc %>>% lrn("surv.coxph")),
  resampling = inner_cv2,
  measure = msr("surv.cindex"),
  terminator = trm("none"), # need to increase later
  fselector = fs("sequential", strategy = "sfs") # sfs is the default
)
cox2$id <- "COX1"

# glmnet Cox model; lambda is tuned on a log10 grid from 10^-3 to 10^0.
cox_lasso2 <- AutoTuner$new(
  learner = as_learner(
    preproc %>>%
      lrn(
        "surv.glmnet",
        lambda = to_tune(p_dbl(lower = -3, upper = 0, trafo = function(x) 10^x))
      )
  ),
  resampling = inner_cv2,
  measure = msr("surv.cindex"),
  terminator = trm("none"),
  tuner = tnr("grid_search", resolution = 10)
)
cox_lasso2$id <- "COX LASSO"

# glmnet Cox model; alpha in [0, 1] and lambda (log10 grid from 10^-3 to
# 10^1) are tuned jointly.
cox_net2 <- AutoTuner$new(
  learner = as_learner(
    preproc %>>%
      lrn(
        "surv.glmnet",
        alpha = to_tune(p_dbl(lower = 0, upper = 1)),
        lambda = to_tune(p_dbl(lower = -3, upper = 1, trafo = function(x) 10^x))
      )
  ),
  resampling = inner_cv2,
  measure = msr("surv.cindex"),
  terminator = trm("none"),
  tuner = tnr("grid_search", resolution = 10)
)
cox_net2$id <- "COX NET"
# Benchmark ----
# Cross the single task with the three learners on the outer CV instance.
desgin <- benchmark_grid(
  tasks = task2,
  learners = c(cox2, cox_lasso2, cox_net2),
  resamplings = outer_cv2
)
# future::plan("multisession")
# Error: Output type of PipeOp select during training (Task) incompatible with input type of PipeOp surv.coxph (TaskSurv)

# Time the run; `mlr3::` is spelled out because mlr also exports `benchmark`.
start_time <- Sys.time()
set.seed(123)
mlr3_bmr <- mlr3::benchmark(desgin)
end_time <- Sys.time()
mlr3_time <- end_time - start_time

# Per-fold scores, renamed to the column layout used for the mlr results.
mlr3_res <- as.data.table(mlr3_bmr$score()) %>%
  select(Task = task_id, Learner = learner_id, Cindex = surv.harrell_c) %>%
  mutate(Color_Package = "mlr3")
# Stack both result tables and compare the per-fold C-index by learner and
# package: dodged boxplots with the fold mean overlaid as a point.
bind_rows(mlr_res, mlr3_res) %>%
  ggplot(aes(Learner, Cindex, fill = Color_Package)) +
  geom_boxplot(position = position_dodge(.8)) +
  stat_summary(
    fun = mean,
    geom = "point",
    aes(group = Color_Package),
    position = position_dodge(.8),
    size = 3
  ) +
  labs(x = "", y = " C-Index") +
  theme_bw() +
  base_theme() +
  theme(legend.position = "top")

Related

VIPER: some of genes in the pathway gene list are missing in the result

I have a problem when I use VIPER with ARACNe networks.
I used regulonbrca from aracne to calculate protein activity.
The number of genes in regulonbrca is 6054,
and the number of genes in regulonbrca that are also in my TPM file is 5918.
But I got only 4506 genes in the viper result.
Why are some genes missing when running viper? Is there a default setting that causes this?
Here is my code.
# Input files and the regulon used for VIPER.
expdat <- "TPM-brca(entrez_avg).csv"
clsdat <- "immunesubtype-BRCA.csv"
regulon <- regulonbrca

# Expression table: column 1 supplies the row names; columns 1 and 2 are
# dropped from the data itself.
dat0 <- read.csv(expdat)
dat1 <- dat0[, -c(1, 2)]
rownames(dat1) <- dat0[, 1]

# Sample classes, restricted to the samples present in the expression table.
cls0 <- read.csv(clsdat)
colnames(cls0) <- c("id", "description")
cls1 <- cls0 %>% dplyr::filter(id %in% colnames(dat1))
# drop = FALSE keeps a data frame even when only one sample matches.
dat <- dat1[, cls1$id, drop = FALSE]
cls <- data.frame(description = cls1[, 2])
rownames(cls) <- cls1[, 1]

# Assemble the ExpressionSet.
meta <- data.frame(labelDescription = c("description"), row.names = colnames(cls))
pheno <- new("AnnotatedDataFrame", data = cls, varMetadata = meta)
dset <- ExpressionSet(assayData = as.matrix(dat), phenoData = pheno)

# Signature: z-scores derived from the two-sided t-test p-values, signed by
# the t statistic (a stray unary `+` was removed; the value is unchanged).
signature <- rowTtest(dset, "description", "TRUE", "FALSE")
signature <- (qnorm(signature$p.value / 2, lower.tail = FALSE) *
  sign(signature$statistic))[, 1]
nullmodel <- ttestNull(dset, "description", "TRUE", "FALSE",
  per = 1000, repos = TRUE, verbose = FALSE
)

# Single-sample VIPER.
# NOTE(review): viper() has a `minsize` argument that may drop regulators
# with too few targets present in the data - TODO confirm whether this
# explains regulators missing from the result.
vpres <- viper(dset, regulon, verbose = FALSE)

# `@` is the S4 slot accessor; the extracted source had `#` here, which
# turned the rest of the line into a comment.
activity <- vpres@assayData$exprs
res_ss0 <- data.frame(id = row.names(activity), activity)

# Gene mapping, merged onto the activity table by `id`; the id column is
# dropped afterwards. (`key` is not a merge() argument, `by` is.)
geneid_ss0 <- ldply(sapply(row.names(res_ss0), converter), data.frame)
colnames(geneid_ss0) <- c("id", "gene")
res_ss <- merge(geneid_ss0, res_ss0, by = "id")[, -1]

Plot and table in one figure in R markdown for HTML output

I'm working in R bookdown and I want to place a plot and a table in one figure. How can I achieve that? Below is the code I have used so far. Can you help?
```{r echo=FALSE, message=FALSE, warning=FALSE, fig.height = 3.5, out.width = '70%', fig.align = "center"}
library(knitr)
library(kableExtra)
library(tidyverse)
library(latex2exp)
options(scipen = 999)
mu <- 0
sigma <- 1
x <- 1

# Draw the normal density over mu +/- 4 sigma
range <- seq(mu - 4 * sigma, mu + 4 * sigma, 0.01)
y <- dnorm(range, mu, sigma)
plot(range, y,
  main = "Standard Normal Distribution", xlab = "Z-score", ylab = " ",
  type = "l", ylim = c(0, max(y) + 0.01), axes = FALSE
)
axis(1, at = seq(mu - 4 * sigma, mu + 4 * sigma, sigma))

# Shade the area to the left of x. The polygon runs along the baseline to the
# left edge, follows the density curve, and drops back to the baseline at the
# right edge. Previously the x vector started with an extra 0 while the y
# vector ended with one, so every vertex was paired with its neighbour's
# height.
shade.x <- seq(min(range), x, 0.01)
cord.a <- c(min(shade.x), shade.x, max(shade.x))
cord.b <- c(0, dnorm(shade.x, mu, sigma), 0)
polygon(cord.a, cord.b, col = "#61a5ff")
text(x = 1.1, y = -.06, TeX('$z = 1.00$'), cex = .8, xpd = NA)
text(x = 0, y = .15, TeX('$p = .8413$'), cex = .8, xpd = NA)

# Create standard normal table: one row per first decimal of z (0.0 to 3.0),
# one column per second decimal (0.00 to 0.09).
options(digits = 4)
u <- seq(0, 3.09, by = 0.01)
p <- pnorm(u)
m <- matrix(p, ncol = 10, byrow = TRUE)
df.m <- as.data.frame(m)
# Bold row labels "**0.0**" ... "**3.0**", generated instead of typed out.
z.values <- sprintf("**%.1f**", (0:30) / 10)
df.z.values <- as.data.frame(z.values)
new.m <- df.z.values %>%
  bind_cols(df.m)
kable(new.m,
  booktabs = TRUE,
  col.names = c(
    "$Z$", "0.00", "0.01", "0.02", "0.03", "0.04",
    "0.05", "0.06", "0.07", "0.08", "0.09"
  ),
  escape = FALSE,
  caption = "Standaard Normaalverdeling",
  linesep = "",
  align = c("r")
) %>%
  kable_styling(font_size = 10)
```
Try this solution:
```{r echo=FALSE, message=FALSE, warning=FALSE, include = FALSE}
library(kableExtra)
# Render the table with kableExtra and save it as a PNG in the working
# directory.
table1 <- save_kable(
  kable_styling(kbl(head(mtcars[1:5])), full_width = FALSE),
  "tab_kbl.png"
)
# Draw the plot into a PNG device in the working directory, then close it.
png("norm_pl.png")
plot(rnorm(10))
dev.off()
```
```{r,echo=FALSE, message=FALSE, warning=FALSE, fig.cap="My image", fig.align = "center"}
library(cowplot)
# Wrap each saved PNG in its own drawing canvas and put the two side by side.
img1 <- ggdraw() +
  draw_image("norm_pl.png", scale = 1)
img2 <- ggdraw() +
  draw_image("tab_kbl.png", scale = 1)
plot_grid(plotlist = list(img1, img2))
```
Another variant:
```{r, fig.align='center', fig.cap="My beautiful image"}
library(gridExtra)
library(grid)
library(ggplot2) # ggplot(), aes() and geom_point() are called below
library(cowplot)
# Table as a grob and a scatter plot, placed side by side; the table gets
# twice the width of the plot.
t1 <- tableGrob(head(mtcars[1:5]), theme = ttheme_minimal())
p2 <- ggplot(mtcars, aes(cyl, mpg)) +
  geom_point()
plot_grid(t1, p2, ncol = 2, rel_widths = c(2, 1))
```

How do I add significance asterisks next to my values in a correlation matrix heat map?

I found this code online at http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization
It provides instructions for how to create a correlation matrix heat map and it works well. However, I was wondering how to get little stars (*) next to the values in the matrix that are significant. How would I go about doing that? Any help is greatly appreciated!
# Six numeric mtcars columns (positions 1, 3, 4, 5, 6, 7) and their
# correlation matrix rounded to two digits.
mydata <- mtcars[, c("mpg", "disp", "hp", "drat", "wt", "qsec")]
head(mydata)
cormat <- round(cor(mydata), digits = 2)
head(cormat)
# Long format: one row per (Var1, Var2) pair.
library(reshape2)
melted_cormat <- melt(cormat)
head(melted_cormat)
# Plain tile plot of the full matrix.
library(ggplot2)
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile()
# Keep the lower triangle (and diagonal) of a correlation matrix; every entry
# above the diagonal is set to NA.
get_lower_tri <- function(cormat) {
  replace(cormat, upper.tri(cormat), NA)
}
# Keep the upper triangle (and diagonal) of a correlation matrix; every entry
# below the diagonal is set to NA.
get_upper_tri <- function(cormat) {
  below_diag <- lower.tri(cormat)
  cormat[below_diag] <- NA
  cormat
}
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix, dropping the NA half
library(reshape2)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Heatmap with a diverging blue-white-red fill centred on 0.
# `limits` is spelled out in full; the abbreviated `limit` depended on
# partial argument matching.
library(ggplot2)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(
    angle = 45, vjust = 1,
    size = 12, hjust = 1
  )) +
  coord_fixed()
# Reorder a correlation matrix so that similar variables sit next to each
# other.
#
# Uses (1 - cormat) / 2 as the distance between variables, clusters them with
# hclust() and applies the resulting order to rows and columns alike.
#
# cormat: a square correlation matrix.
# Returns the reordered matrix. The value is now returned visibly; the
# original ended on an assignment, so calling the function without assigning
# the result printed nothing.
reorder_cormat <- function(cormat) {
  dd <- as.dist((1 - cormat) / 2)
  hc <- hclust(dd)
  cormat[hc$order, hc$order]
}
# Reorder the correlation matrix
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap.
# `limits` is spelled out in full; the abbreviated `limit` depended on
# partial argument matching.
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() + # minimal theme
  theme(axis.text.x = element_text(
    angle = 45, vjust = 1,
    size = 12, hjust = 1
  )) +
  coord_fixed()
# Print the heatmap
print(ggheatmap)
# Add the correlation values as text, strip axis titles, grid, border and
# ticks, and move a horizontal legend inside the plot area.
ggheatmap +
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal"
  ) +
  guides(fill = guide_colorbar(
    barwidth = 7, barheight = 1,
    title.position = "top", title.hjust = 0.5
  ))
cor() doesn't report significance levels; you may have to use rcorr() from the Hmisc package.
This is quite similar to what you want (the graphic output is not so nice, though):
library(ggplot2)
library(reshape2)
library(Hmisc)
library(stats)
# Format numbers as short labels such as "r.05", "r-.05" or "P<.01".
#
# value:  numeric vector (or matrix) of values to format.
# prefix: single string put in front of every formatted value.
#
# Returns a character vector with one label per element of `value`:
#   * NA / NaN             -> "" (no prefix)
#   * rounds to 0          -> prefix followed by "<.01"
#   * anything else        -> prefix followed by the value rounded to two
#                             digits, without the leading zero
# The result is built with vapply() instead of growing a vector with c()
# inside a loop, so empty input yields character(0) rather than NULL.
abbreviateSTR <- function(value, prefix) {
  format_one <- function(item) {
    if (is.na(item)) { # is.na() is TRUE for NaN as well
      return("")
    }
    item <- round(item, 2) # round to two digits
    # if rounding results in 0, say so instead of printing "0"
    label <- if (item == 0) "<.01" else as.character(item)
    label <- sub("(^[0])+", "", label) # remove leading 0: 0.05 -> .05
    label <- sub("(^-[0])+", "-", label) # remove leading -0: -0.05 -> -.05
    paste0(prefix, label)
  }
  vapply(value, format_one, character(1), USE.NAMES = FALSE)
}
# Spearman correlations and p-values for all mtcars columns.
d <- mtcars
cormatrix <- rcorr(as.matrix(d), type = "spearman")

# Long format of the correlation matrix plus two text labels per cell:
# the coefficient ("r...") and the p-value ("P..."), stacked on two lines.
cordata <- melt(cormatrix$r)
cordata$labelr <- abbreviateSTR(cordata$value, "r")
cordata$labelP <- abbreviateSTR(melt(cormatrix$P)$value, "P")
cordata$label <- paste0(cordata$labelr, "\n", cordata$labelP)

# Cells with p > 0.05 get an "X" overlay; all others stay empty.
cordata$strike <- ""
not_significant <- which(cormatrix$P > 0.05)
cordata$strike[not_significant] <- "X"

# Text size derived from the height of the current graphics device.
txtsize <- par("din")[2] / 2
ggplot(cordata, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = TRUE)) +
  xlab("") +
  ylab("") +
  geom_text(label = cordata$label, size = txtsize) +
  geom_text(label = cordata$strike, size = txtsize * 4, color = "red", alpha = 0.4)
Source
difference_p is the P_value of correlation matrix,
ax5 draws the sns.heatmap and return as ax5
# Draw a centred '-' on every heatmap cell whose p-value is below 0.1.
# The loop bodies are re-indented; without indentation the snippet is a
# syntax error.
# NOTE(review): the description above this snippet mentions `difference_p`
# and `ax5`, while the code reads `correlation_p` and draws on `ax4` -
# confirm which names the surrounding code defines.
data = correlation_p
for y in range(data.shape[0]):
    for x in range(data.shape[1]):
        if data[y, x] < 0.1:
            ax4.text(x + 0.5, y + 0.5, '-', size=48,
                     horizontalalignment='center',
                     verticalalignment='center',
                     )

R leaflet display the polygon label by default

I am new to the leaflet package.
I am trying to draw two types of polygons and let the user select them and see the borders. These polygons have labels and I want to display them by default. At the moment the labels are displayed only on mouse hover.
Basically what I want is to let the user search for the polygon label on the map.
Given below is my code.
# Read both polygon layers from the `shapes` directory (each layer is read
# once; the original read the Postcode layer twice).
shp <- readOGR(dsn = "shapes", layer = "SAB")
postcode <- readOGR(dsn = "shapes", layer = "Postcode")

# Target CRS shared by both layers.
CRS_WGS84 <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"

# SAB layer: reproject, aggregate by SMALL_AREA, build HTML labels.
# `@` is the S4 slot accessor; the extracted source had `#` here, which
# turned the rest of the line into a comment.
t_shp <- spTransform(shp, CRS(CRS_WGS84))
sab_shp <- raster::aggregate(t_shp, by = "SMALL_AREA")
dat <- data.table(shp@data)
sabLabels <- sprintf("<strong>SAB: %s", t_shp$SMALL_AREA) %>% lapply(HTML)

# Postcode layer: same steps, keyed on RoutingKey.
# NOTE(review): the postcode labels also start with "SAB:" - confirm whether
# that prefix is intended.
t_shp2 <- spTransform(postcode, CRS(CRS_WGS84))
postcode_shp <- raster::aggregate(t_shp2, by = "RoutingKey")
dat2 <- data.table(postcode@data)
postcodeLabels <- sprintf("<strong>SAB: %s", t_shp2$RoutingKey) %>% lapply(HTML)

leaflet() %>%
  # using default does not allow html export to include the underlying
  # OSM layer
  addTiles() %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
  addPolygons(
    data = t_shp,
    stroke = TRUE,
    fillColor = "grey",
    fillOpacity = 0.2,
    color = "blue",
    weight = 0.5,
    label = sabLabels,
    group = "SABS",
    highlightOptions = highlightOptions(
      color = "blue", weight = 7,
      bringToFront = TRUE
    )
    # ,labelOptions = labelOptions(noHide = TRUE, textOnly = TRUE, opacity = 0.5 , textsize='15px')
  ) %>%
  addPolygons(
    data = t_shp2,
    stroke = TRUE,
    fillOpacity = 0,
    color = "black",
    weight = 1.5,
    label = postcodeLabels,
    group = "PostCodes"
  ) %>%
  addLayersControl(
    overlayGroups = c("SABS", "PostCodes"),
    # `collapsed` is passed as a named argument; the original wrapped an
    # assignment in parentheses and only worked because `collapsed` happens
    # to be the first parameter.
    options = layersControlOptions(collapsed = FALSE)
  )

Predict sentiment score using multiclass logistic regression with R

I am trying to create a sentiment analysis classifier using logistic regression with R (glmnet). Here is the R code:
library(tidyverse)
library(text2vec)
library(caret)
library(glmnet)
library(ggrepel)
# Load the semicolon-separated train and test files.
Train_classifier <- read.csv("IRC.csv", header = TRUE, sep = ";")
Test_classifier <- read.csv("IRC2.csv", header = TRUE, sep = ";")
# select only 4 columns of the data frame
Train <- Train_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]
Test <- Test_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]
# Drop rows whose comment is empty or whitespace only. trimws() strips all
# surrounding whitespace, so the trimmed text must be compared with "";
# the former comparison with " " was always TRUE and removed nothing.
subTrain <- filter(Train, trimws(Raison.Reco) != "")
subTrain$ID <- seq.int(nrow(subTrain))
# Recode the score into four classes: 0-4 -> 0, 5-6 -> 1, 7-8 -> 2,
# everything else -> 3.
subTrain$Note.Reco <- ifelse(
  subTrain$Note.Reco >= 0 & subTrain$Note.Reco <= 4, 0,
  ifelse(
    subTrain$Note.Reco >= 5 & subTrain$Note.Reco <= 6, 1,
    ifelse(subTrain$Note.Reco >= 7 & subTrain$Note.Reco <= 8, 2, 3)
  )
)
subTest <- filter(Test, trimws(Raison.Reco) != "")
subTest$ID <- seq.int(nrow(subTest))
# Data pre-processing ----
# Lower-case the text and split it into word tokens.
prep_fun <- tolower
tok_fun <- word_tokenizer

# Every column is converted to character before the token iterators are
# built; the row ids come from the ID column.
subTrain[] <- lapply(subTrain, as.character)
it_train <- itoken(
  subTrain$Raison.Reco,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = subTrain$ID,
  progressbar = TRUE
)

subTest[] <- lapply(subTest, as.character)
it_test <- itoken(
  subTest$Raison.Reco,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = subTest$ID,
  progressbar = TRUE
)
# Vocabulary and document-term matrices ----
### training data
vocab_train <- create_vocabulary(it_train)
vectorizer_train <- vocab_vectorizer(vocab_train)
# `vectorizer_train` is the object defined above; the original referred to an
# undefined `vectorizer`.
dtm_train <- create_dtm(it_train, vectorizer_train)
### test data
# The test vocabulary is still built, but the test DTM is vectorized with the
# TRAINING vectorizer so that its columns line up with the columns the model
# is fitted on.
vocab_test <- create_vocabulary(it_test)
vectorizer_test <- vocab_vectorizer(vocab_test)
dtm_test <- create_dtm(it_test, vectorizer_train)
## Define tf-idf model
tfidf <- TfIdf$new()
# Fit the model on the training data and transform it; the test data is only
# transformed with the already-fitted model (no re-fitting on test data).
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
dtm_test_tfidf <- transform(dtm_test, tfidf)
# Cross-validated multinomial lasso on the TF-IDF features.
glmnet_classifier <- cv.glmnet(
  x = dtm_train_tfidf,
  y = subTrain[["Note.Reco"]],
  family = "multinomial",
  # L1 penalty
  alpha = 1,
  # "auc" is only defined for two-class logistic regression, so the
  # multinomial deviance is requested explicitly; the plot below therefore
  # shows deviance against log(lambda), not an AUC.
  type.measure = "deviance",
  # 5-fold cross-validation
  nfolds = 5,
  # high value is less accurate, but has faster training
  thresh = 1e-3,
  # again lower number of iterations for faster training
  maxit = 1e3
)
plot(glmnet_classifier)
Here is the structure of the data in subTrain:
Note.Reco Raison.Reco DATE_SAISIE idpart ID
3 Good service 19/03/2014 56992
2 good stuff 19/03/2014 53645
8 very nice 20/02/2016 261392
...
I get this plot (attached file). Can you explain whether it is correct? Thank you.