library(sf)
# Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.4.0; sf_use_s2() is TRUE
#library(HSAR)
#data(depmunic)
#data(properties)
if (packageVersion("spData") > "2.2.0") {
data(depmunic, package="spData")
data(properties, package="spData")
} else {
unzip("data/PropertiesAthens.zip", files=c("depmunic.RData",
"properties.RData"), exdir="data")
load("data/depmunic.RData")
load("data/properties.RData")
}
depmunic$popdens <- depmunic$population/ (10000*depmunic$area)
depmunic$foreigners <- 100 * depmunic$pop_rest/ depmunic$population
depmunic$prgreensp <- depmunic$greensp/ (10000*depmunic$area)
16 Spatial Regression
Exercise 16.1
The archived HSAR package includes an upper level polygon support municipality department data set, ans a lower level property data set. Both are "sf"
objects, in the same projected CRS, provided locally.
The vignette PropertiesAthens.Rmd
can be extracted from the archived HSAR package, and can be run and viewed (“Knit”) in rstudio by replacing on line 18 the command library(HSAR)
with library(Matrix)
.
In the vignette, two upper-level variables are added to the six already present, and we change the green space variable scaling to avoid numerical issues in calculating coefficient standard errors.
summary(depmunic)
# num_dep airbnb museums population
# Min. :1.0 Min. : 144.0 Min. : 0.000 Min. : 45168
# 1st Qu.:2.5 1st Qu.: 377.5 1st Qu.: 0.000 1st Qu.: 78753
# Median :4.0 Median : 565.0 Median : 0.000 Median : 98283
# Mean :4.0 Mean : 712.3 Mean : 2.571 Mean : 93702
# 3rd Qu.:5.5 3rd Qu.: 675.5 3rd Qu.: 0.500 3rd Qu.:112677
# Max. :7.0 Max. :2171.0 Max. :17.000 Max. :129603
# pop_rest greensp area geometry
# Min. : 2735 Min. : 40656 Min. :3.987 POLYGON :7
# 1st Qu.: 4588 1st Qu.: 42340 1st Qu.:4.264 epsg:2100 :0
# Median : 5099 Median : 93715 Median :4.836 +proj=tmer...:0
# Mean : 7109 Mean :183796 Mean :5.420
# 3rd Qu.: 8110 3rd Qu.:294286 3rd Qu.:6.412
# Max. :16531 Max. :478951 Max. :7.764
# popdens foreigners prgreensp
# Min. :0.8039 Min. : 4.890 Min. :0.7708
# 1st Qu.:1.2979 1st Qu.: 5.058 1st Qu.:0.9653
# Median :1.8763 Median : 6.055 Median :1.2070
# Mean :1.8697 Mean : 7.369 Mean :3.3883
# 3rd Qu.:2.2807 3rd Qu.: 8.882 3rd Qu.:4.9527
# Max. :3.2508 Max. :12.755 Max. :9.9040
The properties data set has only four variables, but with price per square metre already added:
summary(properties)
# id size price
# Length:1000 Min. : 21.00 Min. : 8000
# Class :character 1st Qu.: 55.00 1st Qu.: 44750
# Mode :character Median : 75.00 Median : 80000
# Mean : 83.17 Mean : 123677
# 3rd Qu.: 98.00 3rd Qu.: 147000
# Max. :1250.00 Max. :5500000
# prpsqm age dist_metro
# Min. : 207.5 Min. : 0.00 Min. : 4.423
# 1st Qu.: 631.1 1st Qu.:11.00 1st Qu.: 338.523
# Median :1098.8 Median :42.00 Median : 537.250
# Mean :1316.9 Mean :34.16 Mean : 610.772
# 3rd Qu.:1814.2 3rd Qu.:48.00 3rd Qu.: 819.929
# Max. :9166.7 Max. :67.00 Max. :1888.856
# geometry
# POINT :1000
# epsg:2100 : 0
# +proj=tmer...: 0
#
#
#
The values of the variables in depmunic
get copied to each of the properties falling within the boundaries of the municipality departments:
properties_in_dd <- st_join(properties, depmunic, join = st_within)
Exercise 16.2
For polygon support, we prefer contiguous neighbours:
(mun_nb <- spdep::poly2nb(depmunic, row.names=as.character(depmunic$num_dep)))
# Neighbour list object:
# Number of regions: 7
# Number of nonzero links: 20
# Percentage nonzero weights: 40.81633
# Average number of links: 2.857143
Global spatial autocorrelation is marginally detected for the green space variable:
spdep::moran.test(depmunic$prgreensp, spdep::nb2listw(mun_nb))
#
# Moran I test under randomisation
#
# data: depmunic$prgreensp
# weights: spdep::nb2listw(mun_nb)
#
# Moran I statistic standard deviate = 1.825, p-value = 0.034
# alternative hypothesis: greater
# sample estimates:
# Moran I statistic Expectation Variance
# 0.22384238 -0.16666667 0.04578844
Unlike the vignette, which uses distance neighbours up to 1300 m and creates a very dense representation, we choose k=4
k-nearest neighbours, then convert to symmetry (note that some point locations are duplicated, preventing the use of spatial indexing):
(pr_nb_k4s <- spdep::knn2nb(spdep::knearneigh(properties, k=4), sym=TRUE, row.names=properties$id))
# Warning in spdep::knearneigh(properties, k = 4): knearneigh:
# identical points found
# Warning in spdep::knearneigh(properties, k = 4): knearneigh:
# kd_tree not available for identical points
# Warning in spdep::knn2nb(spdep::knearneigh(properties, k = 4), sym
# = TRUE, : neighbour object has 5 sub-graphs
# Neighbour list object:
# Number of regions: 1000
# Number of nonzero links: 5874
# Percentage nonzero weights: 0.5874
# Average number of links: 5.874
# 5 disjoint connected subgraphs
Copying out has led to the introduction of very powerful positive spatial autocorrelation in this and other variables copied out:
spdep::moran.test(properties_in_dd$prgreensp, spdep::nb2listw(pr_nb_k4s))
#
# Moran I test under randomisation
#
# data: properties_in_dd$prgreensp
# weights: spdep::nb2listw(pr_nb_k4s)
#
# Moran I statistic standard deviate = 51.666, p-value <
# 2.2e-16
# alternative hypothesis: greater
# sample estimates:
# Moran I statistic Expectation Variance
# 0.9757099996 -0.0010010010 0.0003573715
Exercise 16.3
The vignette proposes the full property level and municipal department level set of variables straight away. Here we choose the property level ones first, and update for the copied out municipal department level ones next:
f_pr <- prpsqm ~ size + age + dist_metro
f_pr_md <- update(f_pr, . ~ . + foreigners + prgreensp + popdens + museums + airbnb)
Adding in the copied out upper level variables appears to account for more of the variability of the response than leaving them out:
library(mgcv)
# Loading required package: nlme
# This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
pr_base <- gam(f_pr, data=properties_in_dd)
pr_2lev <- gam(f_pr_md, data=properties_in_dd)
anova(pr_base, pr_2lev, test="Chisq")
# Analysis of Deviance Table
#
# Model 1: prpsqm ~ size + age + dist_metro
# Model 2: prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens +
# museums + airbnb
# Resid. Df Resid. Dev Df Deviance Pr(>Chi)
# 1 996 625149227
# 2 991 524571512 5 100577716 < 2.2e-16 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(pr_base)
#
# Family: gaussian
# Link function: identity
#
# Formula:
# prpsqm ~ size + age + dist_metro
#
# Parametric coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1702.61693 78.59752 21.66 < 2e-16 ***
# size 5.18681 0.46510 11.15 < 2e-16 ***
# age -18.11938 1.33737 -13.55 < 2e-16 ***
# dist_metro -0.32446 0.07115 -4.56 5.75e-06 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
#
# R-sq.(adj) = 0.234 Deviance explained = 23.7%
# GCV = 6.3018e+05 Scale est. = 6.2766e+05 n = 1000
summary(pr_2lev)
#
# Family: gaussian
# Link function: identity
#
# Formula:
# prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens +
# museums + airbnb
#
# Parametric coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1648.17176 189.84951 8.681 <2e-16 ***
# size 4.29551 0.43386 9.901 <2e-16 ***
# age -20.18585 1.27009 -15.893 <2e-16 ***
# dist_metro -0.15180 0.07159 -2.120 0.0342 *
# foreigners -38.13473 22.54311 -1.692 0.0910 .
# prgreensp 23.90080 16.33291 1.463 0.1437
# popdens -51.82590 103.65578 -0.500 0.6172
# museums -19.06125 21.27904 -0.896 0.3706
# airbnb 0.63284 0.32887 1.924 0.0546 .
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
#
# R-sq.(adj) = 0.354 Deviance explained = 36%
# GCV = 5.3414e+05 Scale est. = 5.2934e+05 n = 1000
Adding an upper level IID random effect to the base formula also improves the fit of the model substantially:
pr_base_iid <- gam(update(f_pr, . ~ . + s(num_dep, bs="re")), data=properties_in_dd)
anova(pr_base, pr_base_iid, test="Chisq")
# Analysis of Deviance Table
#
# Model 1: prpsqm ~ size + age + dist_metro
# Model 2: prpsqm ~ size + age + dist_metro + s(num_dep, bs = "re")
# Resid. Df Resid. Dev Df Deviance Pr(>Chi)
# 1 996 625149227
# 2 995 557913777 0.99993 67235450 < 2.2e-16 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(pr_base_iid)
#
# Family: gaussian
# Link function: identity
#
# Formula:
# prpsqm ~ size + age + dist_metro + s(num_dep, bs = "re")
#
# Parametric coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2244.07579 89.35422 25.114 < 2e-16 ***
# size 4.63594 0.44249 10.477 < 2e-16 ***
# age -19.12405 1.26739 -15.089 < 2e-16 ***
# dist_metro -0.18383 0.06848 -2.685 0.00738 **
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Approximate significance of smooth terms:
# edf Ref.df F p-value
# s(num_dep) 0.9916 1 118.9 <2e-16 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# R-sq.(adj) = 0.316 Deviance explained = 31.9%
# GCV = 5.6353e+05 Scale est. = 5.6071e+05 n = 1000
This improvement is much more moderate when both the upper level variables and IID random effect are present:
pr_2lev_iid <- gam(update(f_pr_md, . ~ . + s(num_dep, bs="re")), data=properties_in_dd)
anova(pr_2lev, pr_2lev_iid, test="Chisq")
# Analysis of Deviance Table
#
# Model 1: prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens +
# museums + airbnb
# Model 2: prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens +
# museums + airbnb + s(num_dep, bs = "re")
# Resid. Df Resid. Dev Df Deviance Pr(>Chi)
# 1 991.00 524571512
# 2 990.04 522144057 0.95676 2427454 0.02981 *
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(pr_2lev_iid)
#
# Family: gaussian
# Link function: identity
#
# Formula:
# prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens +
# museums + airbnb + s(num_dep, bs = "re")
#
# Parametric coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1520.74633 200.41291 7.588 7.48e-14 ***
# size 4.29515 0.43303 9.919 < 2e-16 ***
# age -20.09458 1.26851 -15.841 < 2e-16 ***
# dist_metro -0.14574 0.07152 -2.038 0.04184 *
# foreigners -83.04410 32.17854 -2.581 0.01000 *
# prgreensp -68.29107 49.95934 -1.367 0.17196
# popdens 261.72640 191.05195 1.370 0.17102
# museums -125.97470 58.73993 -2.145 0.03223 *
# airbnb 1.92092 0.73695 2.607 0.00928 **
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Approximate significance of smooth terms:
# edf Ref.df F p-value
# s(num_dep) 0.7921 1 3.811 0.0285 *
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# R-sq.(adj) = 0.357 Deviance explained = 36.3%
# GCV = 5.3252e+05 Scale est. = 5.2731e+05 n = 1000
Exercise 16.4
The "mrf"
smooth term needs ID keys set so that the neighbour object is correctly matched to the observations. Once these are provided, the properties level model with a municipality department level MRF smooth may be fit:
names(mun_nb) <- attr(mun_nb, "region.id")
properties_in_dd$num_dep <- factor(properties_in_dd$num_dep)
pr_base_mrf <- gam(update(f_pr, . ~ . + s(num_dep, bs="mrf", xt=list(nb=mun_nb))),
data=properties_in_dd)
summary(pr_base_mrf)
#
# Family: gaussian
# Link function: identity
#
# Formula:
# prpsqm ~ size + age + dist_metro + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
#
# Parametric coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1730.81146 76.44218 22.642 <2e-16 ***
# size 4.32010 0.43280 9.982 <2e-16 ***
# age -20.00432 1.26761 -15.781 <2e-16 ***
# dist_metro -0.14718 0.07142 -2.061 0.0396 *
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Approximate significance of smooth terms:
# edf Ref.df F p-value
# s(num_dep) 5.852 6 31.9 <2e-16 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# R-sq.(adj) = 0.357 Deviance explained = 36.3%
# GCV = 5.3255e+05 Scale est. = 5.2731e+05 n = 1000
Repeating for the extended model with upper level variables present, we see that no more response variability is accounted for than in the lower level variables only MRF RE model, and none of the upper level variables are significant at conventional levels.
pr_2lev_mrf <- gam(update(f_pr_md, . ~ . + s(num_dep, bs="mrf", xt=list(nb=mun_nb))),
data=properties_in_dd)
summary(pr_2lev_mrf)
#
# Family: gaussian
# Link function: identity
#
# Formula:
# prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens +
# museums + airbnb + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
#
# Parametric coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1510.58909 425.16218 3.553 0.000399 ***
# size 4.29515 0.43303 9.919 < 2e-16 ***
# age -20.09456 1.26851 -15.841 < 2e-16 ***
# dist_metro -0.14574 0.07152 -2.038 0.041844 *
# foreigners -42.18041 58.30966 -0.723 0.469613
# prgreensp 15.34415 54.84749 0.280 0.779720
# popdens -50.09688 259.59932 -0.193 0.847016
# museums -53.40432 55.25567 -0.966 0.334033
# airbnb 1.01045 0.81453 1.241 0.215072
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Approximate significance of smooth terms:
# edf Ref.df F p-value
# s(num_dep) 0.7922 1 3.812 0.0285 *
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# R-sq.(adj) = 0.357 Deviance explained = 36.3%
# GCV = 5.3252e+05 Scale est. = 5.2731e+05 n = 1000
It also seems that the model without upper level variables outperforms that with them included:
anova(pr_base_mrf, pr_2lev_mrf, test="Chisq")
# Analysis of Deviance Table
#
# Model 1: prpsqm ~ size + age + dist_metro + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
# Model 2: prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens +
# museums + airbnb + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
# Resid. Df Resid. Dev Df Deviance Pr(>Chi)
# 1 990.01 522113016
# 2 990.04 522143950 -0.037064 -30934 0.054 .
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
and the MRF RE outperforms the IID RE:
anova(pr_base_mrf, pr_base_iid, test="Chisq")
# Analysis of Deviance Table
#
# Model 1: prpsqm ~ size + age + dist_metro + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
# Model 2: prpsqm ~ size + age + dist_metro + s(num_dep, bs = "re")
# Resid. Df Resid. Dev Df Deviance Pr(>Chi)
# 1 990.01 522113016
# 2 995.00 557913777 -4.9939 -35800762 2.786e-13 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1