In September 2022, of this year, Stepehn Neumann created a benchmark for moecular weight calculation that he announced on twitter showing that rCDK had dismal performance relative to other tools in the R ecosystem. Something seemed a bit off so I looked into the code.
What I discovered is that the mass spec calculations were mediated by R classes instead of accessing the underlying Java code directly and if you write a function that does that you get a speedup, and if you avoid reflection by creating static calls then you get a really fast function.
This is a good example of how to think about using Java from within R
where the use of the helper rJava functions, J
and
$
allow you to prototype code and benefit from reflection
so you can code sort of like you would in R. Then, if you need
performance, you can tighten down the code a bit by making the calls
static. You can see that progression in the code below which is
accompanied by the outputs from those benchmarks.
will give (2/3) runtime in µs:
21 OrgMassSpecR
163 MetaboCoreUtils
197 enviPat
545 Rdisop
645 CHNOSZ
4863 ChemmineR
22510 rcdk
https://gist.github.com/sneumann/959a6d205ea4ac73eaf1393da0ec0673 ## Benchmark
# Bioconductor Packages. Use BiocManager::install()
# Rdisop MetaboCoreUtils ChemmineR ChemmineOB enviPat
library(plyr)
library(CHNOSZ)
library(enviPat)
library(MetaboCoreUtils)
library(rcdk)
library(ChemmineR)
library(OrgMassSpecR)
library(Rdisop)
#library(ChemmineOB)
data(isotopes)
# original
# https://github.com/CDK-R/cdkr/blob/master/rcdk/R/formula.R
# get.formula <- function(mf, charge=0) {
#
# manipulator <- get("mfManipulator", envir = .rcdk.GlobalEnv)
# if(!is.character(mf)) {
# stop("Must supply a Formula string");
# }else{
# dcob <- .cdkFormula.createChemObject()
# molecularformula <- .cdkFormula.createFormulaObject()
# molecularFormula <- .jcall(manipulator,
# "Lorg/openscience/cdk/interfaces/IMolecularFormula;",
# "getMolecularFormula",
# mf,
# .jcast(molecularformula,.IMolecularFormula),
# TRUE);
# }
#
# D <- new(J("java/lang/Integer"), as.integer(charge))
# .jcall(molecularFormula,"V","setCharge",D);
# object <- .cdkFormula.createObject(.jcast(molecularFormula,.IMolecularFormula));
# return(object);
# }
mfManipulator <- J("org/openscience/cdk/tools/manipulator/MolecularFormulaManipulator")
silentchemobject <- J("org.openscience.cdk.silent.SilentChemObjectBuilder")
#' Rewrite the formual object and directly access Java
#'
get.formula2 <- function(mf) {
formula <- mfManipulator$getMolecularFormula(
"C2H3",
silentchemobject$getInstance())
mfManipulator$getMass(formula)
}
#' Add type hints
#'
get.formula3 <- function(mf) {
builderinstance <- .jcall(
silentchemobject,
"Lorg/openscience/cdk/interfaces/IChemObjectBuilder;",
"getInstance")
formula <- .jcall(
mfManipulator,
"Lorg/openscience/cdk/interfaces/IMolecularFormula;",
"getMolecularFormula",
mf,
builderinstance);
mfManipulator$getMass(formula)
}
#' Add type hints
#'
get.formula4 <- function(mf) {
builderinstance <- .jcall(
silentchemobject,
"Lorg/openscience/cdk/interfaces/IChemObjectBuilder;",
"getInstance")
formula <- .jcall(
mfManipulator,
"Lorg/openscience/cdk/interfaces/IMolecularFormula;",
"getMolecularFormula",
mf,
builderinstance);
.jcall(
mfManipulator,
"D",
"getMass",
formula)
}
benchmark <- microbenchmark::microbenchmark(
MetaboCoreUtils = MetaboCoreUtils::calculateMass("C2H6O"),
rcdk = rcdk::get.formula("C2H6O", charge = 0)@mass,
rcdk2 = get.formula2("C2H6O"),
rcdk3 = get.formula3("C2H6O"),
rcdk4 = get.formula4("C2H6O"),
Rdisop = Rdisop::getMolecule("C2H6O")$exactmass,
ChemmineR = ChemmineR::exactMassOB(ChemmineR::smiles2sdf("CCO")),
OrgMassSpecR = OrgMassSpecR::MonoisotopicMass(formula = OrgMassSpecR::ListFormula("C2H6O)"), charge = 0),
CHNOSZ = CHNOSZ::mass("C2H6O"),
enviPat = enviPat::isopattern(isotopes, "C2H6O", charge=FALSE, verbose=FALSE)[[1]][1,1]
, times=1000L)
masses <- c(
MetaboCoreUtils=MetaboCoreUtils::calculateMass("C2H6O"),
rcdk=rcdk::get.formula("C2H6O", charge = 0)@mass,
Rdisop=Rdisop::getMolecule("C2H6O")$exactmass,
#ChemmineR=ChemmineR::exactMassOB(ChemmineR::smiles2sdf("CCO")),
OrgMassSpecR=OrgMassSpecR::MonoisotopicMass(formula = OrgMassSpecR::ListFormula("C2H6O)"), charge = 0),
CHNOSZ=CHNOSZ::mass("C2H6O"),
enviPat=enviPat::isopattern(isotopes, "C2H6O", charge=FALSE, verbose=FALSE)[[1]][1,1]
)
options(digits=10)
t(t(sort(masses)))
summary(benchmark)[order(summary(benchmark)[,"median"]) , ]
clipr::write_clip(as.data.frame(summary(benchmark)[order(summary(benchmark)[,"median"]) , ] ))
expr min lq mean median uq
1 MetaboCoreUtils 69.479 122.8465 154.049427 139.6495 156.2700
10 enviPat 83.250 143.0935 170.429197 160.5360 179.6570
5 rcdk4 175.889 228.8605 324.182735 271.2955 327.7135
8 OrgMassSpecR 249.287 333.3135 392.479869 357.6665 401.5585
6 Rdisop 382.417 459.8790 538.068697 490.1505 557.9975
9 CHNOSZ 355.145 510.2910 588.186951 555.9165 632.2060
4 rcdk3 781.987 1004.7160 1294.507318 1133.3415 1339.4695
3 rcdk2 2078.465 2392.4950 2920.601088 2612.8025 2931.5465
7 ChemmineR 3227.320 3790.0455 4808.783873 4044.1410 4465.1000
2 rcdk 14823.815 16456.7715 19088.569430 17485.0800 19468.7195