library(tidyverse)
# ── Attaching core tidyverse packages ──────────── tidyverse 2.0.0 ──
# ✔ dplyr 1.1.4 ✔ readr 2.1.5
# ✔ forcats 1.0.0 ✔ stringr 1.5.1
# ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
# ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
# ✔ purrr 1.0.2
# ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
# ✖ dplyr::filter() masks stats::filter()
# ✖ dplyr::lag() masks stats::lag()
# ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
# Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.4.0; sf_use_s2() is TRUE
system.file("gpkg/nc.gpkg", package="sf") |>
read_sf() -> nc
nc |> mutate(SID = SID74/BIR74, NWB = NWBIR74/BIR74) -> nc1
library(randomForest) |> suppressPackageStartupMessages()
r = randomForest(SID ~ NWB, nc1)
nc1$rf = predict(r)
plot(rf~SID, nc1)
abline(0, 1)
10 Statistical models for spatial data
Exercise 10.1
following the lm
example of Section 10.2 use a random forest model to predict SID
values (e.g. using package randomForest), and plot the random forest predictions against observations, along with the \(x=y\) line.
Exercise 10.2
Create a new dataset by randomly sampling 1000 points from the nc
dataset, and rerun the linear regression model of section 10.2 on this dataset. What has changed?
pts = st_sample(nc, 1000)
nc2 = st_intersection(nc1, pts)
# Warning: attribute variables are assumed to be spatially constant
# throughout all geometries
lm(SID ~ NWB, nc1) |> summary()
#
# Call:
# lm(formula = SID ~ NWB, data = nc1)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.0033253 -0.0007411 -0.0000691 0.0005479 0.0062218
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.0006773 0.0002327 2.910 0.00447 **
# NWB 0.0043785 0.0006204 7.058 2.44e-10 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.001288 on 98 degrees of freedom
# Multiple R-squared: 0.337, Adjusted R-squared: 0.3302
# F-statistic: 49.82 on 1 and 98 DF, p-value: 2.438e-10
lm(SID ~ NWB, nc2) |> summary()
#
# Call:
# lm(formula = SID ~ NWB, data = nc2)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.0033222 -0.0008319 -0.0000049 0.0005459 0.0062251
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 7.444e-04 7.166e-05 10.39 <2e-16 ***
# NWB 4.263e-03 1.895e-04 22.49 <2e-16 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.001214 on 998 degrees of freedom
# Multiple R-squared: 0.3364, Adjusted R-squared: 0.3357
# F-statistic: 505.9 on 1 and 998 DF, p-value: < 2.2e-16
we see that the standard error has decreased with a factor 3 (sqrt(10)).
For prediction interval widths:
no change, as this dominated by the residual variance;
Confidence intervals for the predicted means:
drops for larger dataset, as this is dominated by the standard errors of estimated coefficients.
Exercise 10.3
Redo the water-land classification of section 7.4 using class::knn
instead of lda
.
Preparing the dataset:
tif <- system.file("tif/L7_ETMs.tif", package = "stars")
library(stars)
# Loading required package: abind
(r <- read_stars(tif))
# stars object with 3 dimensions and 1 attribute
# attribute(s):
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# L7_ETMs.tif 1 54 69 68.91242 86 255
# dimension(s):
# from to offset delta refsys point x/y
# x 1 349 288776 28.5 SIRGAS 2000 / ... FALSE [x]
# y 1 352 9120761 -28.5 SIRGAS 2000 / ... FALSE [y]
# band 1 6 NA NA NA NA
set.seed(115517)
pts <- st_bbox(r) |> st_as_sfc() |> st_sample(20)
(e <- st_extract(r, pts))
# stars object with 2 dimensions and 1 attribute
# attribute(s):
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# L7_ETMs.tif 12 41.75 63 60.95833 80.5 145
# dimension(s):
# from to refsys point
# geometry 1 20 SIRGAS 2000 / ... TRUE
# band 1 6 NA NA
# values
# geometry POINT (293002....,...,POINT (290941....
# band NULL
plot(r[,,,1], reset = FALSE)
col <- rep("yellow", 20)
col[c(8, 14, 15, 18, 19)] = "red"
st_as_sf(e) |> st_coordinates() |> text(labels = 1:20, col = col)
rs <- split(r)
trn <- st_extract(rs, pts)
trn$cls <- rep("land", 20)
trn$cls[c(8, 14, 15, 18, 19)] <- "water"
estimation and prediction happen in one command:
library(class)
as.data.frame(trn) |> select(X1, X2, X3, X4, X5, X6) -> tr
as.data.frame(rs) |> select(X1, X2, X3, X4, X5, X6) -> test
rs$cls = knn(tr, test, trn$cl, k = 5)
plot(rs["cls"])
Exercise 10.4
For the nc
data: estimation
st_centroid(nc1) |> st_coordinates() -> cc
# Warning: st_centroid assumes attributes are constant over
# geometries
bind_cols(nc1, cc) |> transmute(X=X, Y=Y, SID=SID, NWB=NWB) -> nc2
(lm0 <- lm(SID ~ NWB, nc1)) |> summary()
#
# Call:
# lm(formula = SID ~ NWB, data = nc1)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.0033253 -0.0007411 -0.0000691 0.0005479 0.0062218
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.0006773 0.0002327 2.910 0.00447 **
# NWB 0.0043785 0.0006204 7.058 2.44e-10 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.001288 on 98 degrees of freedom
# Multiple R-squared: 0.337, Adjusted R-squared: 0.3302
# F-statistic: 49.82 on 1 and 98 DF, p-value: 2.438e-10
(lm1 <- lm(SID ~ NWB+X+Y, nc2)) |> summary()
#
# Call:
# lm(formula = SID ~ NWB + X + Y, data = nc2)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.0027669 -0.0007998 -0.0001568 0.0006015 0.0053235
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -1.248e-03 1.050e-02 -0.119 0.90564
# NWB 5.950e-03 7.983e-04 7.454 3.99e-11 ***
# X -2.266e-04 7.794e-05 -2.907 0.00453 **
# Y -4.655e-04 2.167e-04 -2.148 0.03424 *
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.001213 on 96 degrees of freedom
# Multiple R-squared: 0.424, Adjusted R-squared: 0.406
# F-statistic: 23.56 on 3 and 96 DF, p-value: 1.645e-11
(lm2 <- lm(SID ~ NWB+X+Y+I(X^2)+I(Y^2)+X*Y, nc2)) |> summary()
#
# Call:
# lm(formula = SID ~ NWB + X + Y + I(X^2) + I(Y^2) + X * Y, data = nc2)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.0027759 -0.0007526 -0.0001441 0.0006399 0.0053102
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 4.703e-01 7.119e-01 0.661 0.511
# NWB 5.978e-03 9.463e-04 6.317 9.08e-09 ***
# X -2.632e-04 1.016e-02 -0.026 0.979
# Y -2.719e-02 2.576e-02 -1.055 0.294
# I(X^2) -1.064e-05 3.387e-05 -0.314 0.754
# I(Y^2) 3.240e-04 3.445e-04 0.941 0.349
# X:Y -4.714e-05 1.688e-04 -0.279 0.781
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.001221 on 93 degrees of freedom
# Multiple R-squared: 0.4339, Adjusted R-squared: 0.3974
# F-statistic: 11.88 on 6 and 93 DF, p-value: 7.383e-10
The first order model seems to have significant coordinate effects, for the second order model none of the coordinate effects are significant.
Prediction:
nc1$pr0 <- lm0 |> predict(nc2)
nc1$pr1 <- lm1 |> predict(nc2)
nc1$pr2 <- lm2 |> predict(nc2)
nc1[c("pr0", "pr1", "pr2")] |> st_as_stars() |> merge() |> plot(breaks = "equal")
Largely the same pattern is shown in the predictions, some extremes get more extreme.
For the knn
on the remote sensing data:
cbind(as.data.frame(trn), st_coordinates(trn)) |>
select(X, Y, X1, X2, X3, X4, X5, X6) -> tr1
as.data.frame(rs) |> transmute(X=x, Y=y, X1, X2, X3, X4, X5, X6) -> test1
rs$cls1 = knn(tr1, test1, trn$cl, k = 5)
cbind(as.data.frame(trn), st_coordinates(trn)) |>
transmute(X, Y, X2=X^2, Y2=Y^2, XY=X*Y, X1, X2, X3, X4, X5, X6) -> tr2
as.data.frame(rs) |>
transmute(X=x, Y=y, X2=X^2, Y2=Y^2, XY=X*Y, X1, X2, X3, X4, X5, X6) -> test2
rs$cls2 = knn(tr2, test2, trn$cl, k = 5)
rs[c("cls", "cls1", "cls2")] |> merge() |> plot()
Both models involving coordinates show much worse results!