R Code for Mapping Dissolved Oxygen Using Random Forest (RF) and Support Vector Machine (SVM)

First, credit goes to the Soil Organic Carbon Mapping Cookbook for making the original R code publicly available.

For this paper, I slightly tweaked that code so it could be applied to mapping dissolved oxygen (DO) using Random Forest (RF) and Support Vector Machine (SVM).

Here is the compiled R code. You may need to tweak it further, depending on your dataset.

install.packages(c("aqp", "automap", "car", "caret", "e1071", "GSIF", "htmlwidgets", "leaflet", "mapview", "Metrics", "openair", "plotKML", "psych", "quantregForest", "randomForest", "raster", "rasterVis", "reshape", "rgdal", "RSQLite", "snow", "soiltexture", "sf", "sp"))

shell("cls")

# Data-splitting for training and test dataset

library(caret)
dat <- read.csv(file = "data/site-level.csv")
train.ind <- createDataPartition(1:nrow(dat), p = .70, list = FALSE)
train <- dat[ train.ind,]
test  <- dat[-train.ind,]

plot(density(log(train$DO)), col = 'red', main = "")
lines(density(log(test$DO)), col='blue')
legend('topright', legend=c("train", "test"),
       col=c("red", "blue"), lty=1, cex=1.5)

write.csv(train, file="data/dat_train.csv", row.names = FALSE)
write.csv(test, file="data/dat_test.csv", row.names = FALSE)

# Data Loading and Preprocessing

dat <- read.csv(file = "data/dat_train.csv ")
files <- list.files(path = "data/covs", pattern = "tif$", full.names = TRUE)
covs <- stack(files)
save(covs, file = " data/covariates.RData")

# Upgrade points data frame to SpatialPointsDataFrame. Extract values from covariates to the soil points

library(sp)
coordinates(dat) <- ~ X + Y
dat <- extract(x = covs, y = dat, sp = TRUE)
summary(dat@data)
dat <- as.data.frame(dat)

# Remove NA values. Export as a *.csv table

dat <- dat[complete.cases(dat),]
write.csv(dat, "data /MKD_RegMatrix.csv", row.names = FALSE)



# METHOD 1: RANDOM FOREST
# Load data

dat <- read.csv(file = "data/MKD_RegMatrix.csv")
str(dat)
library(sp)
library(raster)
coordinates(dat) <- ~ X + Y
class(dat)

# CRS of the point data and covariates must be the same

dat@proj4string <- CRS(projargs = "+init=epsg:4326")
dat@proj4string
load(file = "data/covariates.RData")
names(covs)
# Build the model formula: log(DO) as a function of all covariates except layer 14 of the stack
fm <- as.formula(paste("log(DO) ~", paste0(names(covs[[-14]]), collapse = "+")))
# Tuning parameters

library(randomForest)
library(caret)
ctrl <- trainControl(method = "cv", savePredictions = TRUE)
rfmodel <- train(fm, data = dat@data, method = "rf", trControl = ctrl,
                 importance = TRUE)
rfmodel
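# Optional: tune mtry over an explicit grid instead of caret's default search
# (fold count and mtry values below are arbitrary examples)
ctrlGrid <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
rfGrid <- train(fm, data = dat@data, method = "rf", trControl = ctrlGrid,
                tuneGrid = expand.grid(mtry = c(2, 4, 6, 8)), importance = TRUE)
rfGrid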

# Variable importance plot, compare with the correlation matrix

varImpPlot(rfmodel$finalModel)
plot(rfmodel$finalModel)
pred <- predict(covs, rfmodel)
pred <- exp(pred)
writeRaster(pred, filename = "data/DO_rf.tif", overwrite=TRUE)
plot(pred)
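# Optional sanity check: compare the range of the mapped values with the observed DO at the training points
summary(dat@data$DO)
c(cellStats(pred, "min"), cellStats(pred, "max"))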

# METHOD 2: SUPPORT VECTOR MACHINE
# Load data

dat <- read.csv("data/MKD_RegMatrix.csv")
library(sp)
library(raster)
coordinates(dat) <- ~ X + Y
class(dat)

# Import and load covariates

load(file = "data/covariates.RData")
names(covs)
names(dat@data)
# Correlate DO (here column 2) with the covariate columns (adjust the column indices to your dataset)
selectedCovs <- cor(x = as.matrix(dat@data[, 2]), y = as.matrix(dat@data[, -c(1:15)]))
selectedCovs

# Select the top five covariates

library(reshape)
x <- subset(melt(selectedCovs), value != 1 & !is.na(value))
x <- x[with(x, order(-abs(x$value))),]
idx <- as.character(x$X2[1:5])
COV <- covs[[idx]]
names(COV)
names(dat@data)

# Fitting an SVM model

library(e1071)
library(caret)
tuneResult <- tune(svm, DO ~.,  data = dat@data[,c("DO", names(COV))],
                   ranges = list(epsilon = seq(0.1,0.2,0.02),
                                 cost = c(5,7,15,20)))
plot(tuneResult)
tunedModel <- tuneResult$best.model
print(tunedModel)
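# Optional: if the chosen epsilon or cost sits at the edge of the grid above, widen the search
# (the ranges below are still arbitrary examples) and take best.model from the new result
tuneResultWide <- tune(svm, DO ~ ., data = dat@data[, c("DO", names(COV))],
                       ranges = list(epsilon = seq(0, 0.3, 0.05),
                                     cost = 2^(0:6)))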

# Predict the DO 

DOsvm <- predict(COV, tunedModel)
rmse <- function(errval)
{
  val = sqrt(mean(errval^2))
  return(val)
}

errval <- tunedModel$residuals   # training residuals: observed DO minus fitted DO
svr_RMSE <- rmse(errval)
print(paste('svr RMSE =', svr_RMSE))
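# Note: tunedModel$residuals are training residuals, so the RMSE above is optimistic;
# the cross-validated error from the tuning run is a fairer number to report
sqrt(tuneResult$best.performance)   # square root of the CV mean squared error from tune()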
writeRaster(DOsvm, filename = "data/DO_svm.tif", overwrite=TRUE)
plot(DOsvm)

# Evaluate the contribution of each covariate to the model 

w <- t(tunedModel$coefs) %*% tunedModel$SV
w <- apply(w, 2, function(v){sqrt(sum(v^2))})  
w <- sort(w, decreasing = T)
print(w)
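# A bar plot makes the relative covariate weights easier to compare
barplot(w, las = 2, main = "SVM covariate weights")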

# Validation

library(sp)
library(raster)
dat <- read.csv(file = "data/dat_test.csv")
coordinates(dat) <- ~ X + Y

DO_rf <- raster("data/DO_rf.tif")
DO_svm <- raster("data/DO_svm.tif")

dat <- extract(x = DO_rf, y = dat, sp = TRUE)
dat <- extract(x = DO_svm, y = dat, sp = TRUE)
dat$PE_rf <- dat$DO_rf - dat$DO
dat$PE_svm <- dat$DO_svm - dat$DO 
write.csv(dat, "data /validation.csv", row.names = F)

# Estimating the map quality measures

ME_rf <- mean(dat$PE_rf, na.rm=TRUE)
ME_svm <- mean(dat$PE_svm, na.rm=TRUE)
MAE_rf <- mean(abs(dat$PE_rf), na.rm=TRUE)
MAE_svm <- mean(abs(dat$PE_svm), na.rm=TRUE)
MSE_rf <- mean(dat$PE_rf^2, na.rm=TRUE)
MSE_svm <- mean(dat$PE_svm^2, na.rm=TRUE)
RMSE_rf <- sqrt(MSE_rf)
RMSE_svm <- sqrt(MSE_svm)
AVE_rf <- 1 - sum(dat$PE_rf^2, na.rm=TRUE) / 
  sum( (dat$DO - mean(dat$DO, na.rm = TRUE))^2, 
       na.rm = TRUE)

AVE_svm <- 1 - sum(dat$PE_svm^2, na.rm=TRUE) / 
  sum( (dat$DO - mean(dat$DO, na.rm = TRUE))^2, 
       na.rm = TRUE)
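# Collect the quality measures in one table to compare the two models side by side
data.frame(model = c("RF", "SVM"),
           ME = c(ME_rf, ME_svm),
           MAE = c(MAE_rf, MAE_svm),
           RMSE = c(RMSE_rf, RMSE_svm),
           AVE = c(AVE_rf, AVE_svm))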

# Graphical map quality measures

par(mfrow=c(2,2))
plot(dat$DO_rf, dat$DO, main="rf", xlab="predicted", 
     ylab='observed')
abline(0,1, lty=2, col='black')
abline(lm(dat$DO ~ dat$DO_rf), col = 'blue', lty=2)

plot(dat$DO_svm, dat$DO, main="svm", xlab="predicted", 
     ylab='observed')
abline(0,1, lty=2, col='black')
abline(lm(dat$DO ~ dat$DO_svm), col = 'blue', lty=2)
par(mfrow=c(1,1))

# Spatial bubbles for prediction errors 

bubble(dat[!is.na(dat$PE_rf),], "PE_rf", pch = 21, 
       col=c('red', 'green'))
bubble(dat[!is.na(dat$PE_svm),], "PE_svm", pch = 21, 
       col=c('red', 'green'))

# Model correlations, spatial differences, and evaluation

library(raster)
RF  <- raster('data/DO_rf.tif')
SVM <- raster('data/DO_svm.tif')
models <- stack(RF, SVM)

# Compare the statistical distribution and correlation between the models

library(psych)
pairs.panels(na.omit(as.data.frame(models)),
             method = "pearson",   # correlation method
             hist.col = "#00AFBB",
             density = TRUE,       # show density plots
             ellipses = TRUE)      # show correlation ellipses

library(rasterVis)
library(mapview)
densityplot(models)
SD <- calc(models , sd)
mapview(SD)
RFSVM <- calc(models[[c(1,2)]], diff)
names(RFSVM) <- c('RFvsSVM')
X <- raster::cellStats(RFSVM, mean)
levelplot(RFSVM - X, at=seq(-0.5,0.5, length.out=10),
          par.settings = RdBuTheme)

# Model evaluation

dat <- read.csv("data/validation.csv")
modRF <- data.frame(obs = dat$DO, mod = dat$DO_rf, 
                    model = "RF")
modSVM <- data.frame(obs = dat$DO, mod = dat$DO_svm, 
                     model = "SVM")
modData <- rbind(modRF, modSVM)
summary(modData)

# Calculate statistical evaluation metrics

library(openair)
modsts <- modStats(modData,obs = "obs", mod = "mod", type = "model")
modsts

TaylorDiagram(modData, obs = "obs", mod = "mod", group = "model",
              cols = c("green", "yellow", "red","blue"), 
              cor.col='brown', rms.col='black')
conditionalQuantile(modData,obs = "obs", mod = "mod", type = "model")
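# Optional: keep the openair statistics with the other outputs (file name is just an example)
write.csv(modsts, "data/model_stats.csv", row.names = FALSE)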
