First, I will give credit to the Soil Organic Carbon Mapping Cookbook for making the original R code available for use to the public.
In this paper, I did a little tweak to the R code so I could apply it to map dissolved oxygen using Random Forest (RF) and Support Vector Machine (SVM).
Here is the compiled R-code. You may need to tweak the code further, depending on your dataset.
install.packages(c("aqp", "automap", "car", "caret", "e1071", "GSIF", "htmlwidgets", "leaflet", "mapview", "Metrics", "openair", "plotKML", "psych", "quantregForest", "randomForest", "raster", "rasterVis", "reshape", "rgdal", "RSQLite", "snow", "soiltexture", "sf", "sp"))
shell("cls")
# Data-splitting for training and test dataset
library(caret)
dat <- read.csv(file = "data/site-level.csv")
train.ind <- createDataPartition(1:nrow(dat), p = .70, list = FALSE)
train <- dat[ train.ind,]
test <- dat[-train.ind,]
plot(density (log(train$DO)), col='red', main="")
lines(density(log(test$DO)), col='blue')
legend('topright', legend=c("train", "test"),
col=c("red", "blue"), lty=1, cex=1.5)
write.csv(train, file="data/dat_train.csv", row.names = FALSE)
write.csv(test, file="data/dat_test.csv", row.names = FALSE)
# Data Loading and Preprocessing
dat <- read.csv(file = "data/dat_train.csv ")
files <- list.files(path = "data/covs", pattern = "tif$", full.names = TRUE)
covs <- stack(files)
save(covs, file = " data/covariates.RData")
# Upgrade points data frame to SpatialPointsDataFrame. Extract values from covariates to the soil points
coordinates(dat) <- ~ X + Y
dat <- extract(x = covs, y = dat, sp = TRUE)
summary(dat@data)
dat <- as.data.frame(dat)
# Remove NA values. Export as a *.csv table
dat <- dat[complete.cases(dat),]
write.csv(dat, "data /MKD_RegMatrix.csv", row.names = FALSE)
# METHOD 1: RANDOM FOREST
# Load data
dat <- read.csv(file = "data/MKD_RegMatrix.csv")
str(dat)
library(sp)
coordinates(dat) <- ~ X + Y
class(dat)
# CRS of the point data and covariates must be the same
dat@proj4string <- CRS(projargs = "+init=epsg:4326")
dat@proj4string
load(file = "data/covariates.RData")
names(covs)
fm = as.formula(paste("log(DO) ~", paste0(names(covs[[-14]]),
collapse = "+")))
# Tuning parameters
library(randomForest)
library(caret)
ctrl <- trainControl(method = "cv", savePred=T)
rfmodel <- train(fm, data=dat@data, method = "rf", trControl = ctrl,
importance=TRUE)
rfmodel
# Variable importance plot, compare with the correlation matrix
varImpPlot(rfmodel[11][[1]])
plot(rfmodel[11][[1]])
pred <- predict(covs, rfmodel)
pred <- exp(pred)
writeRaster(pred, filename = "data/DO_rf.tif", overwrite=TRUE)
plot(pred)
# METHOD 2: SUPPORT VECTOR MACHINE
# Load data
dat <- read.csv("data/MKD_RegMatrix.csv")
library(sp)
coordinates(dat) <- ~ X + Y
class(dat)
# Import and load covariates
load(file = "data/covariates.RData")
names(covs)
names(dat@data)
selectedCovs <- cor(x = as.matrix(dat@data[,2]), y = as.matrix(dat@data[,-c(1:15)]))
selectedCovs
# Select the top five covariates
library(reshape)
x <- subset(melt(selectedCovs), value != 1 | value != NA)
x <- x[with(x, order(-abs(x$value))),]
idx <- as.character(x$X2[1:5])
COV <- covs[[idx]]
names(COV)
dat@data <- cbind(dat@data)
names(dat@data)
# Fitting a SVM model
library(e1071)
library(caret)
tuneResult <- tune(svm, DO ~., data = dat@data[,c("DO", names(COV))],
ranges = list(epsilon = seq(0.1,0.2,0.02),
cost = c(5,7,15,20)))
plot(tuneResult)
tunedModel <- tuneResult$best.model
print(tunedModel)
# Predict the DO
DOsvm <- predict(COV, tunedModel)
rmse <- function(errval)
{
val = sqrt(mean(errval^2))
return(val)
}
errval <- tunedModel$residuals # same as data$Y - predictedY
svr_RMSE <- rmse(errval)
print(paste('svr RMSE = ',
svr_RMSE))
writeRaster(DOsvm, filename = "data/DO_svm.tif", overwrite=TRUE)
plot(DOsvm)
# Evaluate the contribution of each covariate to the model
w <- t(tunedModel$coefs) %*% tunedModel$SV
w <- apply(w, 2, function(v){sqrt(sum(v^2))})
w <- sort(w, decreasing = T)
print(w)
# Validation
library(sp)
library(raster)
dat <- read.csv(file = "data/dat_test.csv")
coordinates(dat) <- ~ X + Y
DO_rf <- raster("data/DO_rf.tif")
DO_svm <- raster("data/DO_svm.tif")
dat <- extract(x = DO_rf, y = dat, sp = TRUE)
dat <- extract(x = DO_svm, y = dat, sp = TRUE)
dat$PE_rf <- dat$DO_rf - dat$DO
dat$PE_svm <- dat$DO_svm - dat$DO
write.csv(dat, "data /validation.csv", row.names = F)
# Estimating the map quality measures
ME_rf <- mean(dat$PE_rf, na.rm=TRUE)
ME_svm <- mean(dat$PE_svm, na.rm=TRUE)
MAE_rf <- mean(abs(dat$PE_rf), na.rm=TRUE)
MAE_svm <- mean(abs(dat$PE_svm), na.rm=TRUE)
MSE_rf <- mean(dat$PE_rf^2, na.rm=TRUE)
MSE_svm <- mean(dat$PE_svm^2, na.rm=TRUE)
RMSE_rf <- sqrt(sum(dat$PE_rf^2, na.rm=TRUE) / length(dat$PE_rf))
RMSE_svm <- sqrt(sum(dat$PE_svm^2, na.rm=TRUE) / length(dat$PE_svm))
AVE_rf <- 1 - sum(dat$PE_rf^2, na.rm=TRUE) /
sum( (dat$DO - mean(dat$DO, na.rm = TRUE))^2,
na.rm = TRUE)
AVE_svm <- 1 - sum(dat$PE_svm^2, na.rm=TRUE) /
sum( (dat$DO - mean(dat$DO, na.rm = TRUE))^2,
na.rm = TRUE)
# Graphical map quality measures
par(mfrow=c(2,2))
plot(dat$DO_rf, dat$DO, main="rf", xlab="predicted",
ylab='observed')
abline(0,1, lty=2, col='black')
abline(lm(dat$DO ~ dat$DO_rf), col = 'blue', lty=2)
plot(dat$DO_svm, dat$DO, main="svm", xlab="predicted",
ylab='observed')
abline(0,1, lty=2, col='black')
abline(lm(dat$DO ~ dat$DO_svm), col = 'blue', lty=2)
par(mfrow=c(1,1))
# Spatial bubbles for prediction errors
bubble(dat[!is.na(dat$PE_rf),], "PE_rf", pch = 21,
col=c('red', 'green'))
bubble(dat[!is.na(dat$PE_svm),], "PE_svm", pch = 21,
col=c('red', 'green'))
# Model correlations and spatial differences and evaluation
library(raster)
RF<-raster('data/DO_rf.tif')
SVM<-raster('data/DO_svm.tif')
models <- stack(RF, SVM)
# Compare the statistical distribution and correlation between the models
library(psych)
pairs.panels(na.omit(as.data.frame(models)),
# Correlation method
method = "pearson",
hist.col = "#00AFBB",
# Show density plots
density = TRUE,
# Show correlation ellipses
ellipses = TRUE
)
library(rasterVis)
library(mapview)
densityplot(models)
SD <- calc(models , sd)
mapview(SD)
RFSVM <- calc(models[[c(1,2)]], diff)
names(RFSVM) <- c('RFvsSVM')
X <- raster::cellStats(RFSVM, mean)
levelplot(RFSVM - X, at=seq(-0.5,0.5, length.out=10),
par.settings = RdBuTheme)
# Model evaluation
dat <- read.csv("data/validation.csv")
modRF <- data.frame(obs = dat$DO, mod = dat$DO_rf,
model = "RF")
modSVM <- data.frame(obs = dat$DO, mod = dat$DO_svm,
model = "SVM")
modData <- rbind(modRF, modSVM)
summary(modData)
# Calculate statistical evaluation metrics
library(openair)
modsts <- modStats(modData,obs = "obs", mod = "mod", type = "model")
modsts
TaylorDiagram(modData, obs = "obs", mod = "mod", group = "model",
cols = c("green", "yellow", "red","blue"),
cor.col='brown', rms.col='black')
conditionalQuantile(modData,obs = "obs", mod = "mod", type = "model")