16  Spatial Regression

Exercise 16.1

The archived HSAR package includes an upper level polygon support municipality department data set, ans a lower level property data set. Both are "sf" objects, in the same projected CRS, provided locally.

library(sf)
# Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
#library(HSAR)
#data(depmunic)
#data(properties)
if (packageVersion("spData") > "2.2.0") {
  data(depmunic, package="spData")
  data(properties, package="spData")
} else {
  unzip("data/PropertiesAthens.zip", files=c("depmunic.RData",
    "properties.RData"), exdir="data")
  load("data/depmunic.RData")
  load("data/properties.RData")
}
depmunic$popdens <- depmunic$population/ (10000*depmunic$area)
depmunic$foreigners <- 100 * depmunic$pop_rest/ depmunic$population
depmunic$prgreensp <- depmunic$greensp/ (10000*depmunic$area)

The vignette PropertiesAthens.Rmd can be extracted from the archived HSAR package, and can be run and viewed (“Knit”) in rstudio by replacing on line 18 the command library(HSAR) with library(Matrix).

In the vignette, two upper-level variables are added to the six already present, and we change the green space variable scaling to avoid numerical issues in calculating coefficient standard errors.

summary(depmunic)
#     num_dep        airbnb          museums         population    
#  Min.   :1.0   Min.   : 144.0   Min.   : 0.000   Min.   : 45168  
#  1st Qu.:2.5   1st Qu.: 377.5   1st Qu.: 0.000   1st Qu.: 78753  
#  Median :4.0   Median : 565.0   Median : 0.000   Median : 98283  
#  Mean   :4.0   Mean   : 712.3   Mean   : 2.571   Mean   : 93702  
#  3rd Qu.:5.5   3rd Qu.: 675.5   3rd Qu.: 0.500   3rd Qu.:112677  
#  Max.   :7.0   Max.   :2171.0   Max.   :17.000   Max.   :129603  
#     pop_rest        greensp            area                geometry
#  Min.   : 2735   Min.   : 40656   Min.   :3.987   POLYGON      :7  
#  1st Qu.: 4588   1st Qu.: 42340   1st Qu.:4.264   epsg:2100    :0  
#  Median : 5099   Median : 93715   Median :4.836   +proj=tmer...:0  
#  Mean   : 7109   Mean   :183796   Mean   :5.420                    
#  3rd Qu.: 8110   3rd Qu.:294286   3rd Qu.:6.412                    
#  Max.   :16531   Max.   :478951   Max.   :7.764                    
#     popdens         foreigners       prgreensp     
#  Min.   :0.8039   Min.   : 4.890   Min.   :0.7708  
#  1st Qu.:1.2979   1st Qu.: 5.058   1st Qu.:0.9653  
#  Median :1.8763   Median : 6.055   Median :1.2070  
#  Mean   :1.8697   Mean   : 7.369   Mean   :3.3883  
#  3rd Qu.:2.2807   3rd Qu.: 8.882   3rd Qu.:4.9527  
#  Max.   :3.2508   Max.   :12.755   Max.   :9.9040

The properties data set has only four variables, but with price per square metre already added:

summary(properties)
#       id                 size             price        
#  Length:1000        Min.   :  21.00   Min.   :   8000  
#  Class :character   1st Qu.:  55.00   1st Qu.:  44750  
#  Mode  :character   Median :  75.00   Median :  80000  
#                     Mean   :  83.17   Mean   : 123677  
#                     3rd Qu.:  98.00   3rd Qu.: 147000  
#                     Max.   :1250.00   Max.   :5500000  
#      prpsqm            age          dist_metro      
#  Min.   : 207.5   Min.   : 0.00   Min.   :   4.423  
#  1st Qu.: 631.1   1st Qu.:11.00   1st Qu.: 338.523  
#  Median :1098.8   Median :42.00   Median : 537.250  
#  Mean   :1316.9   Mean   :34.16   Mean   : 610.772  
#  3rd Qu.:1814.2   3rd Qu.:48.00   3rd Qu.: 819.929  
#  Max.   :9166.7   Max.   :67.00   Max.   :1888.856  
#           geometry   
#  POINT        :1000  
#  epsg:2100    :   0  
#  +proj=tmer...:   0  
#                      
#                      
# 

The values of the variables in depmunic get copied to each of the properties falling within the boundaries of the municipality departments:

properties_in_dd <- st_join(properties, depmunic, join = st_within)

Exercise 16.2

For polygon support, we prefer contiguous neighbours:

(mun_nb <- spdep::poly2nb(depmunic, row.names=as.character(depmunic$num_dep)))
# Neighbour list object:
# Number of regions: 7 
# Number of nonzero links: 20 
# Percentage nonzero weights: 40.81633 
# Average number of links: 2.857143

Global spatial autocorrelation is marginally detected for the green space variable:

spdep::moran.test(depmunic$prgreensp, spdep::nb2listw(mun_nb))
# 
#   Moran I test under randomisation
# 
# data:  depmunic$prgreensp  
# weights: spdep::nb2listw(mun_nb)    
# 
# Moran I statistic standard deviate = 1.825, p-value = 0.034
# alternative hypothesis: greater
# sample estimates:
# Moran I statistic       Expectation          Variance 
#        0.22384238       -0.16666667        0.04578844

Unlike the vignette, which uses distance neighbours up to 1300 m and creates a very dense representation, we choose k=4 k-nearest neighbours, then convert to symmetry (note that some point locations are duplicated, preventing the use of spatial indexing):

(pr_nb_k4s <- spdep::knn2nb(spdep::knearneigh(properties, k=4), sym=TRUE, row.names=properties$id))
# Warning in spdep::knearneigh(properties, k = 4): knearneigh:
# identical points found
# Warning in spdep::knearneigh(properties, k = 4): knearneigh:
# kd_tree not available for identical points
# Neighbour list object:
# Number of regions: 1000 
# Number of nonzero links: 5874 
# Percentage nonzero weights: 0.5874 
# Average number of links: 5.874 
# 5 disjoint connected subgraphs

Copying out has led to the introduction of very powerful positive spatial autocorrelation in this and other variables copied out:

spdep::moran.test(properties_in_dd$prgreensp, spdep::nb2listw(pr_nb_k4s))
# 
#   Moran I test under randomisation
# 
# data:  properties_in_dd$prgreensp  
# weights: spdep::nb2listw(pr_nb_k4s)    
# 
# Moran I statistic standard deviate = 51.666, p-value <
# 2.2e-16
# alternative hypothesis: greater
# sample estimates:
# Moran I statistic       Expectation          Variance 
#      0.9757099996     -0.0010010010      0.0003573715

Exercise 16.3

The vignette proposes the full property level and municipal department level set of variables straight away. Here we choose the property level ones first, and update for the copied out municipal department level ones next:

f_pr <- prpsqm ~ size + age + dist_metro
f_pr_md <- update(f_pr, . ~ . + foreigners + prgreensp + popdens + museums + airbnb)

Adding in the copied out upper level variables appears to account for more of the variability of the response than leaving them out:

library(mgcv)
# Loading required package: nlme
# This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
pr_base <- gam(f_pr, data=properties_in_dd)
pr_2lev <- gam(f_pr_md, data=properties_in_dd)
anova(pr_base, pr_2lev, test="Chisq")
# Analysis of Deviance Table
# 
# Model 1: prpsqm ~ size + age + dist_metro
# Model 2: prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens + 
#     museums + airbnb
#   Resid. Df Resid. Dev Df  Deviance  Pr(>Chi)    
# 1       996  625149227                           
# 2       991  524571512  5 100577716 < 2.2e-16 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(pr_base)
# 
# Family: gaussian 
# Link function: identity 
# 
# Formula:
# prpsqm ~ size + age + dist_metro
# 
# Parametric coefficients:
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1702.61693   78.59752   21.66  < 2e-16 ***
# size           5.18681    0.46510   11.15  < 2e-16 ***
# age          -18.11938    1.33737  -13.55  < 2e-16 ***
# dist_metro    -0.32446    0.07115   -4.56 5.75e-06 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# 
# R-sq.(adj) =  0.234   Deviance explained = 23.7%
# GCV = 6.3018e+05  Scale est. = 6.2766e+05  n = 1000
summary(pr_2lev)
# 
# Family: gaussian 
# Link function: identity 
# 
# Formula:
# prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens + 
#     museums + airbnb
# 
# Parametric coefficients:
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1648.17176  189.84951   8.681   <2e-16 ***
# size           4.29551    0.43386   9.901   <2e-16 ***
# age          -20.18585    1.27009 -15.893   <2e-16 ***
# dist_metro    -0.15180    0.07159  -2.120   0.0342 *  
# foreigners   -38.13473   22.54311  -1.692   0.0910 .  
# prgreensp     23.90080   16.33291   1.463   0.1437    
# popdens      -51.82590  103.65578  -0.500   0.6172    
# museums      -19.06125   21.27904  -0.896   0.3706    
# airbnb         0.63284    0.32887   1.924   0.0546 .  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# 
# R-sq.(adj) =  0.354   Deviance explained =   36%
# GCV = 5.3414e+05  Scale est. = 5.2934e+05  n = 1000

Adding an upper level IID random effect to the base formula also improves the fit of the model substantially:

pr_base_iid <- gam(update(f_pr, . ~ . + s(num_dep, bs="re")), data=properties_in_dd)
anova(pr_base, pr_base_iid, test="Chisq")
# Analysis of Deviance Table
# 
# Model 1: prpsqm ~ size + age + dist_metro
# Model 2: prpsqm ~ size + age + dist_metro + s(num_dep, bs = "re")
#   Resid. Df Resid. Dev      Df Deviance  Pr(>Chi)    
# 1       996  625149227                               
# 2       995  557913777 0.99993 67235450 < 2.2e-16 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(pr_base_iid)
# 
# Family: gaussian 
# Link function: identity 
# 
# Formula:
# prpsqm ~ size + age + dist_metro + s(num_dep, bs = "re")
# 
# Parametric coefficients:
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 2244.07579   89.35422  25.114  < 2e-16 ***
# size           4.63594    0.44249  10.477  < 2e-16 ***
# age          -19.12405    1.26739 -15.089  < 2e-16 ***
# dist_metro    -0.18383    0.06848  -2.685  0.00738 ** 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Approximate significance of smooth terms:
#               edf Ref.df     F p-value    
# s(num_dep) 0.9916      1 118.9  <2e-16 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# R-sq.(adj) =  0.316   Deviance explained = 31.9%
# GCV = 5.6353e+05  Scale est. = 5.6071e+05  n = 1000

This improvement is much more moderate when both the upper level variables and IID random effect are present:

pr_2lev_iid <- gam(update(f_pr_md, . ~ . + s(num_dep, bs="re")), data=properties_in_dd)
anova(pr_2lev, pr_2lev_iid, test="Chisq")
# Analysis of Deviance Table
# 
# Model 1: prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens + 
#     museums + airbnb
# Model 2: prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens + 
#     museums + airbnb + s(num_dep, bs = "re")
#   Resid. Df Resid. Dev      Df Deviance Pr(>Chi)  
# 1    991.00  524571512                            
# 2    990.04  522144057 0.95676  2427454  0.02981 *
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(pr_2lev_iid)
# 
# Family: gaussian 
# Link function: identity 
# 
# Formula:
# prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens + 
#     museums + airbnb + s(num_dep, bs = "re")
# 
# Parametric coefficients:
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1520.74633  200.41291   7.588 7.48e-14 ***
# size           4.29515    0.43303   9.919  < 2e-16 ***
# age          -20.09458    1.26851 -15.841  < 2e-16 ***
# dist_metro    -0.14574    0.07152  -2.038  0.04184 *  
# foreigners   -83.04410   32.17854  -2.581  0.01000 *  
# prgreensp    -68.29107   49.95934  -1.367  0.17196    
# popdens      261.72640  191.05195   1.370  0.17102    
# museums     -125.97470   58.73993  -2.145  0.03223 *  
# airbnb         1.92092    0.73695   2.607  0.00928 ** 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Approximate significance of smooth terms:
#               edf Ref.df     F p-value  
# s(num_dep) 0.7921      1 3.811  0.0285 *
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# R-sq.(adj) =  0.357   Deviance explained = 36.3%
# GCV = 5.3252e+05  Scale est. = 5.2731e+05  n = 1000

Exercise 16.4

The "mrf" smooth term needs ID keys set so that the neighbour object is correctly matched to the observations. Once these are provided, the properties level model with a municipality department level MRF smooth may be fit:

names(mun_nb) <- attr(mun_nb, "region.id")
properties_in_dd$num_dep <- factor(properties_in_dd$num_dep)
pr_base_mrf <- gam(update(f_pr, . ~ . + s(num_dep, bs="mrf", xt=list(nb=mun_nb))),
    data=properties_in_dd)
summary(pr_base_mrf)
# 
# Family: gaussian 
# Link function: identity 
# 
# Formula:
# prpsqm ~ size + age + dist_metro + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
# 
# Parametric coefficients:
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1730.81146   76.44218  22.642   <2e-16 ***
# size           4.32010    0.43280   9.982   <2e-16 ***
# age          -20.00432    1.26761 -15.781   <2e-16 ***
# dist_metro    -0.14718    0.07142  -2.061   0.0396 *  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Approximate significance of smooth terms:
#              edf Ref.df    F p-value    
# s(num_dep) 5.852      6 31.9  <2e-16 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# R-sq.(adj) =  0.357   Deviance explained = 36.3%
# GCV = 5.3255e+05  Scale est. = 5.2731e+05  n = 1000

Repeating for the extended model with upper level variables present, we see that no more response variability is accounted for than in the lower level variables only MRF RE model, and none of the upper level variables are significant at conventional levels.

pr_2lev_mrf <- gam(update(f_pr_md, . ~ . + s(num_dep, bs="mrf", xt=list(nb=mun_nb))),
    data=properties_in_dd)
summary(pr_2lev_mrf)
# 
# Family: gaussian 
# Link function: identity 
# 
# Formula:
# prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens + 
#     museums + airbnb + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
# 
# Parametric coefficients:
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1510.58909  425.16218   3.553 0.000399 ***
# size           4.29515    0.43303   9.919  < 2e-16 ***
# age          -20.09456    1.26851 -15.841  < 2e-16 ***
# dist_metro    -0.14574    0.07152  -2.038 0.041844 *  
# foreigners   -42.18041   58.30966  -0.723 0.469613    
# prgreensp     15.34415   54.84749   0.280 0.779720    
# popdens      -50.09688  259.59932  -0.193 0.847016    
# museums      -53.40432   55.25567  -0.966 0.334033    
# airbnb         1.01045    0.81453   1.241 0.215072    
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Approximate significance of smooth terms:
#               edf Ref.df     F p-value  
# s(num_dep) 0.7922      1 3.812  0.0285 *
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# R-sq.(adj) =  0.357   Deviance explained = 36.3%
# GCV = 5.3252e+05  Scale est. = 5.2731e+05  n = 1000

It also seems that the model without upper level variables outperforms that with them included:

anova(pr_base_mrf, pr_2lev_mrf, test="Chisq")
# Analysis of Deviance Table
# 
# Model 1: prpsqm ~ size + age + dist_metro + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
# Model 2: prpsqm ~ size + age + dist_metro + foreigners + prgreensp + popdens + 
#     museums + airbnb + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
#   Resid. Df Resid. Dev        Df Deviance Pr(>Chi)  
# 1    990.01  522113016                              
# 2    990.04  522143950 -0.037064   -30934    0.054 .
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

and the MRF RE outperforms the IID RE:

anova(pr_base_mrf, pr_base_iid, test="Chisq")
# Analysis of Deviance Table
# 
# Model 1: prpsqm ~ size + age + dist_metro + s(num_dep, bs = "mrf", xt = list(nb = mun_nb))
# Model 2: prpsqm ~ size + age + dist_metro + s(num_dep, bs = "re")
#   Resid. Df Resid. Dev      Df  Deviance  Pr(>Chi)    
# 1    990.01  522113016                                
# 2    995.00  557913777 -4.9939 -35800762 2.786e-13 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
save(list = ls(), file = "data/ch16.RData")