Exercises

For the exercises it is best to install a fresh copy of stars, using

if (!require(remotes))
  install.packages("remotes")
remotes::install_github("r-spatial/stars")

and to install starsdata, which is about 1 Gb in size (don't forget to remove afterwards):

install.packages("starsdata", repos = "https://gis-bigdata.uni-muenster.de/pebesma", type = "source")

What are data cubes?

  • what are data cubes? examples?
  • what defines data cubes?
  • are datacubes three-dimensional?
  • are tables (e.g. data.frames) data cubes?
  • is a raster dataset (e.g. a RasterLayer, or single-band GeoTIFF) a data cube?
  • what defines a dimension of a data cube?
library(stars)
## Loading required package: abind
## Loading required package: sf
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 7.0.0

Vector data cube: panel data

Panel data are time series data collected for a set of subjects (persons, companies, countries etc). That makes them (i) multidimensional, and (ii) spatiotemporal, since the subjects are typically spatial entities (though they might move).

library(ggplot2)
data(Produc, package = "plm")
head(Produc)
##     state year region     pcap     hwy   water    util       pc   gsp    emp
## 1 ALABAMA 1970      6 15032.67 7325.80 1655.68 6051.20 35793.80 28418 1010.5
## 2 ALABAMA 1971      6 15501.94 7525.94 1721.02 6254.98 37299.91 29375 1021.9
## 3 ALABAMA 1972      6 15972.41 7765.42 1764.75 6442.23 38670.30 31303 1072.3
## 4 ALABAMA 1973      6 16406.26 7907.66 1742.41 6756.19 40084.01 33430 1135.5
## 5 ALABAMA 1974      6 16762.67 8025.52 1734.85 7002.29 42057.31 33749 1169.8
## 6 ALABAMA 1975      6 17316.26 8158.23 1752.27 7405.76 43971.71 33604 1155.4
##   unemp
## 1   4.7
## 2   5.2
## 3   4.7
## 4   3.9
## 5   5.5
## 6   7.7
ggplot(Produc) + geom_raster(aes(y = state, x = year, fill = pcap))

this plot shows the raw pcap (public capital stock) values, which mainly demonstrates that large states are large. Normalizing them by dividing through the temporal means, we see more structure:

(s = st_as_stars(Produc, y_decreasing = FALSE))
## stars object with 2 dimensions and 9 attributes
## attribute(s):
##     region           pcap              hwy            water        
##  Min.   :1.000   Min.   :  2627   Min.   : 1827   Min.   :  228.5  
##  1st Qu.:3.000   1st Qu.:  7097   1st Qu.: 3858   1st Qu.:  764.5  
##  Median :5.000   Median : 17572   Median : 7556   Median : 2266.5  
##  Mean   :4.958   Mean   : 25037   Mean   :10218   Mean   : 3618.8  
##  3rd Qu.:7.000   3rd Qu.: 27692   3rd Qu.:11267   3rd Qu.: 4318.7  
##  Max.   :9.000   Max.   :140217   Max.   :47699   Max.   :24592.3  
##      util               pc               gsp              emp         
##  Min.   :  538.5   Min.   :  4053   Min.   :  4354   Min.   :  108.3  
##  1st Qu.: 2488.3   1st Qu.: 21651   1st Qu.: 16502   1st Qu.:  475.0  
##  Median : 7008.8   Median : 40671   Median : 39987   Median : 1164.8  
##  Mean   :11199.5   Mean   : 58188   Mean   : 61014   Mean   : 1747.1  
##  3rd Qu.:11598.5   3rd Qu.: 64796   3rd Qu.: 68126   3rd Qu.: 2114.1  
##  Max.   :80728.1   Max.   :375342   Max.   :464550   Max.   :11258.0  
##      unemp       
##  Min.   : 2.800  
##  1st Qu.: 5.000  
##  Median : 6.200  
##  Mean   : 6.602  
##  3rd Qu.: 7.900  
##  Max.   :18.000  
## dimension(s):
##       from to offset delta refsys point              values x/y
## state    1 48     NA    NA     NA    NA ALABAMA,...,WYOMING [x]
## year     1 17   1986    -1     NA    NA                NULL [y]
s = st_apply(s, 1, function(x) x/mean(x)) %>%
  st_set_dimensions(names = c("year", "state"))
s
## stars object with 2 dimensions and 9 attributes
## attribute(s):
##     region       pcap              hwy             water       
##  Min.   :1   Min.   :0.6705   Min.   :0.6181   Min.   :0.5294  
##  1st Qu.:1   1st Qu.:0.9327   1st Qu.:0.9646   1st Qu.:0.8626  
##  Median :1   Median :1.0146   Median :1.0088   Median :0.9968  
##  Mean   :1   Mean   :1.0000   Mean   :1.0000   Mean   :1.0000  
##  3rd Qu.:1   3rd Qu.:1.0644   3rd Qu.:1.0413   3rd Qu.:1.1286  
##  Max.   :1   Max.   :1.4160   Max.   :1.1928   Max.   :1.6239  
##      util              pc               gsp              emp        
##  Min.   :0.5385   Min.   :0.6231   Min.   :0.6215   Min.   :0.6055  
##  1st Qu.:0.9236   1st Qu.:0.8718   1st Qu.:0.8921   1st Qu.:0.9065  
##  Median :1.0178   Median :1.0041   Median :1.0012   Median :1.0155  
##  Mean   :1.0000   Mean   :1.0000   Mean   :1.0000   Mean   :1.0000  
##  3rd Qu.:1.0727   3rd Qu.:1.1204   3rd Qu.:1.0939   3rd Qu.:1.0920  
##  Max.   :1.7289   Max.   :1.4786   Max.   :1.6435   Max.   :1.4798  
##      unemp       
##  Min.   :0.4340  
##  1st Qu.:0.7989  
##  Median :0.9633  
##  Mean   :1.0000  
##  3rd Qu.:1.1732  
##  Max.   :1.9416  
## dimension(s):
##       from to offset delta refsys point              values
## year     1 17     NA    NA     NA    NA                NULL
## state    1 48     NA    NA     NA    NA ALABAMA,...,WYOMING
pr = as.data.frame(s)
head(pr)
##   year   state region     pcap      hwy    water     util       pc      gsp
## 1    1 ALABAMA      1 1.099395 1.061091 1.166170 1.123685 1.220994 1.269043
## 2    2 ALABAMA      1 1.083229 1.050617 1.137907 1.104474 1.202354 1.228148
## 3    3 ALABAMA      1 1.073425 1.042153 1.128882 1.093014 1.177764 1.182770
## 4    4 ALABAMA      1 1.065874 1.036919 1.126735 1.081551 1.166810 1.107454
## 5    5 ALABAMA      1 1.065665 1.040853 1.119124 1.078759 1.149528 1.057200
## 6    6 ALABAMA      1 1.065680 1.038889 1.120675 1.080527 1.119022 1.077569
##        emp    unemp
## 1 1.164437 1.217836
## 2 1.135630 1.105994
## 3 1.104277 1.366959
## 4 1.057407 1.739766
## 5 1.044436 1.739766
## 6 1.072367 1.366959
ggplot(pr) + geom_raster(aes(y = state, x = year, fill = pcap))

We can also match the state names to state geometries:

library(maps) 
states.m = map('state', plot=FALSE, fill=TRUE)
#IDs <- sapply(strsplit(states.m$names, ":"), function(x) x[1])
st = st_as_sf(states.m)[-8,] # deselect D.C., not in Produc
match(tolower(st_get_dimension_values(s, "state")), sub(" ", "_", st$ID))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 NA 41 42 43 44 45 46 47 48

this indicates that we do not have to reorder the states... then we can simply set them:

(s_sf = st_set_dimensions(s, "state", values = st_geometry(st)))
## stars object with 2 dimensions and 9 attributes
## attribute(s):
##     region       pcap              hwy             water       
##  Min.   :1   Min.   :0.6705   Min.   :0.6181   Min.   :0.5294  
##  1st Qu.:1   1st Qu.:0.9327   1st Qu.:0.9646   1st Qu.:0.8626  
##  Median :1   Median :1.0146   Median :1.0088   Median :0.9968  
##  Mean   :1   Mean   :1.0000   Mean   :1.0000   Mean   :1.0000  
##  3rd Qu.:1   3rd Qu.:1.0644   3rd Qu.:1.0413   3rd Qu.:1.1286  
##  Max.   :1   Max.   :1.4160   Max.   :1.1928   Max.   :1.6239  
##      util              pc               gsp              emp        
##  Min.   :0.5385   Min.   :0.6231   Min.   :0.6215   Min.   :0.6055  
##  1st Qu.:0.9236   1st Qu.:0.8718   1st Qu.:0.8921   1st Qu.:0.9065  
##  Median :1.0178   Median :1.0041   Median :1.0012   Median :1.0155  
##  Mean   :1.0000   Mean   :1.0000   Mean   :1.0000   Mean   :1.0000  
##  3rd Qu.:1.0727   3rd Qu.:1.1204   3rd Qu.:1.0939   3rd Qu.:1.0920  
##  Max.   :1.7289   Max.   :1.4786   Max.   :1.6435   Max.   :1.4798  
##      unemp       
##  Min.   :0.4340  
##  1st Qu.:0.7989  
##  Median :0.9633  
##  Mean   :1.0000  
##  3rd Qu.:1.1732  
##  Max.   :1.9416  
## dimension(s):
##      from to offset delta refsys point
## year    1 17     NA    NA     NA    NA
## sfc     1 48     NA    NA WGS 84 FALSE
##                                                                 values
## year                                                              NULL
## sfc  MULTIPOLYGON (((-87.46201 3...,...,MULTIPOLYGON (((-109.0511 4...
# plot a single time slice:
(s_sf_y1 = s_sf[,1])
## stars object with 2 dimensions and 9 attributes
## attribute(s):
##     region       pcap              hwy             water           util        
##  Min.   :1   Min.   :0.9677   Min.   :0.9235   Min.   :1.012   Min.   :0.9296  
##  1st Qu.:1   1st Qu.:1.0470   1st Qu.:1.0195   1st Qu.:1.162   1st Qu.:1.0174  
##  Median :1   Median :1.0911   Median :1.0613   Median :1.241   Median :1.0887  
##  Mean   :1   Mean   :1.1183   Mean   :1.0654   Mean   :1.249   Mean   :1.1374  
##  3rd Qu.:1   3rd Qu.:1.1593   3rd Qu.:1.1196   3rd Qu.:1.306   3rd Qu.:1.2134  
##  Max.   :1   Max.   :1.4160   Max.   :1.1928   Max.   :1.624   Max.   :1.7289  
##       pc               gsp              emp            unemp       
##  Min.   :0.9806   Min.   :0.9794   Min.   :1.012   Min.   :0.5657  
##  1st Qu.:1.1667   1st Qu.:1.1610   1st Qu.:1.115   1st Qu.:0.8593  
##  Median :1.2175   Median :1.2304   Median :1.168   Median :1.0205  
##  Mean   :1.2411   Mean   :1.2426   Mean   :1.181   Mean   :1.0569  
##  3rd Qu.:1.3299   3rd Qu.:1.3263   3rd Qu.:1.250   3rd Qu.:1.2191  
##  Max.   :1.4786   Max.   :1.6435   Max.   :1.480   Max.   :1.8478  
## dimension(s):
##      from to offset delta refsys point
## year    1  1     NA    NA     NA    NA
## sfc     1 48     NA    NA WGS 84 FALSE
##                                                                 values
## year                                                              NULL
## sfc  MULTIPOLYGON (((-87.46201 3...,...,MULTIPOLYGON (((-109.0511 4...
# drop the obsolete dimension:
(s_sf_y1 = adrop(s_sf_y1))
## stars object with 1 dimensions and 9 attributes
## attribute(s):
##     region       pcap              hwy             water           util        
##  Min.   :1   Min.   :0.9677   Min.   :0.9235   Min.   :1.012   Min.   :0.9296  
##  1st Qu.:1   1st Qu.:1.0470   1st Qu.:1.0195   1st Qu.:1.162   1st Qu.:1.0174  
##  Median :1   Median :1.0911   Median :1.0613   Median :1.241   Median :1.0887  
##  Mean   :1   Mean   :1.1183   Mean   :1.0654   Mean   :1.249   Mean   :1.1374  
##  3rd Qu.:1   3rd Qu.:1.1593   3rd Qu.:1.1196   3rd Qu.:1.306   3rd Qu.:1.2134  
##  Max.   :1   Max.   :1.4160   Max.   :1.1928   Max.   :1.624   Max.   :1.7289  
##       pc               gsp              emp            unemp       
##  Min.   :0.9806   Min.   :0.9794   Min.   :1.012   Min.   :0.5657  
##  1st Qu.:1.1667   1st Qu.:1.1610   1st Qu.:1.115   1st Qu.:0.8593  
##  Median :1.2175   Median :1.2304   Median :1.168   Median :1.0205  
##  Mean   :1.2411   Mean   :1.2426   Mean   :1.181   Mean   :1.0569  
##  3rd Qu.:1.3299   3rd Qu.:1.3263   3rd Qu.:1.250   3rd Qu.:1.2191  
##  Max.   :1.4786   Max.   :1.6435   Max.   :1.480   Max.   :1.8478  
## dimension(s):
##     from to offset delta refsys point
## sfc    1 48     NA    NA WGS 84 FALSE
##                                                                values
## sfc MULTIPOLYGON (((-87.46201 3...,...,MULTIPOLYGON (((-109.0511 4...
# convert to sf:
sf = st_as_sf(s_sf_y1)
plot(sf)

Vector data cube: North Carolina SIDS

The North Carolina SIDS (sudden infant death syndrome) data set, introduced here, is an epidemiological data set containing population (births, BIR), disease cases (SID), and non-white births (NWBIR) for two periods, indicated by the start years 1974 and 1979. The population and time information is spread over columns in a data.frame:

nc = read_sf(system.file("gpkg/nc.gpkg", package="sf"))
nc.df = st_set_geometry(nc, NULL) # m is a regular, non-spatial data.frame
head(nc.df)
## # A tibble: 6 x 14
##    AREA PERIMETER CNTY_ CNTY_ID NAME  FIPS  FIPSNO CRESS_ID BIR74 SID74 NWBIR74
##   <dbl>     <dbl> <dbl>   <dbl> <chr> <chr>  <dbl>    <int> <dbl> <dbl>   <dbl>
## 1 0.114      1.44  1825    1825 Ashe  37009  37009        5  1091     1      10
## 2 0.061      1.23  1827    1827 Alle… 37005  37005        3   487     0      10
## 3 0.143      1.63  1828    1828 Surry 37171  37171       86  3188     5     208
## 4 0.07       2.97  1831    1831 Curr… 37053  37053       27   508     1     123
## 5 0.153      2.21  1832    1832 Nort… 37131  37131       66  1421     9    1066
## 6 0.097      1.67  1833    1833 Hert… 37091  37091       46  1452     7     954
## # … with 3 more variables: BIR79 <dbl>, SID79 <dbl>, NWBIR79 <dbl>

We now mold this table into a 100 (counties) x 3 (categories) x 2 (years) array:

mat = as.matrix(nc.df[c("BIR74", "SID74", "NWBIR74", "BIR79", "SID79", "NWBIR79")])
dim(mat) = c(county = 100, var = 3, year = 2) # make it a 3-dimensional array
# set dimension values to the array:
dimnames(mat) = list(county = nc$NAME, var = c("BIR", "SID", "NWBIR"), year = c(1974, 1979))
# convert array into a stars object
(nc.st = st_as_stars(pop = mat))
## stars object with 3 dimensions and 1 attribute
## attribute(s):
##       pop       
##  Min.   :    0  
##  1st Qu.:    8  
##  Median :  538  
##  Mean   : 1657  
##  3rd Qu.: 1784  
##  Max.   :30757  
## dimension(s):
##        from  to offset delta refsys point              values
## county    1 100     NA    NA     NA    NA  Ashe,...,Brunswick
## var       1   3     NA    NA     NA    NA BIR  , SID  , NWBIR
## year      1   2     NA    NA     NA    NA          1974, 1979

after which we can replace the county names with the county geometries:

(nc.geom <- st_set_dimensions(nc.st, 1, st_geometry(nc)))
## stars object with 3 dimensions and 1 attribute
## attribute(s):
##       pop       
##  Min.   :    0  
##  1st Qu.:    8  
##  Median :  538  
##  Mean   : 1657  
##  3rd Qu.: 1784  
##  Max.   :30757  
## dimension(s):
##      from  to offset delta refsys point
## sfc     1 100     NA    NA  NAD27 FALSE
## var     1   3     NA    NA     NA    NA
## year    1   2     NA    NA     NA    NA
##                                                                 values
## sfc  MULTIPOLYGON (((-81.47276 3...,...,MULTIPOLYGON (((-78.65572 3...
## var                                                BIR  , SID  , NWBIR
## year                                                        1974, 1979

Note that we have now two fields filled for the sfc (simple feature geometry) dimension: refsys and points. What do they mean?

We can compute and plot the sums over the years for each county (1) and variable (2):

plot(st_apply(nc.geom, c(1,2), sum), key.pos = 4) # sum population over year

In order to meaningfully compare disease cases, we standardise incidence rates (SIR), by

\[\mbox{SIR}_i=\frac{c_i/p_i}{\sum_i c_i / \sum_i p_i}\]

with \(c_i\) the incidences and \(p_i\) the corresponding population of county \(i\). For SIR, the value one indicates equality to the mean incidence rate. We first compute the global incidence \(m\):

# split out BIR, SID, NWBIR over attributes:
split(nc.geom, 2) 
## stars object with 2 dimensions and 3 attributes
## attribute(s):
##       BIR             SID             NWBIR      
##  Min.   :  248   Min.   : 0.000   Min.   :    1  
##  1st Qu.: 1177   1st Qu.: 2.000   1st Qu.:  206  
##  Median : 2265   Median : 5.000   Median :  742  
##  Mean   : 3762   Mean   : 7.515   Mean   : 1202  
##  3rd Qu.: 4451   3rd Qu.: 9.000   3rd Qu.: 1316  
##  Max.   :30757   Max.   :57.000   Max.   :11631  
## dimension(s):
##      from  to offset delta refsys point
## sfc     1 100     NA    NA  NAD27 FALSE
## year    1   2     NA    NA     NA    NA
##                                                                 values
## sfc  MULTIPOLYGON (((-81.47276 3...,...,MULTIPOLYGON (((-78.65572 3...
## year                                                        1974, 1979
# sum by category:
(nc.sum = sapply(split(nc.geom, 2), sum)) 
##    BIR    SID  NWBIR 
## 752354   1503 240362
# denominator: mean incidence ratio, averaged over county & years:
(IR = nc.sum[2]/nc.sum[1]) 
##        SID 
## 0.00199773
# standardise each year/counte value by dividing over IR:
nc.SIR = st_apply(nc.geom, c(1,3), function(x) (x[2]/x[1])/IR)
plot(nc.SIR, breaks = c(0,.25,.5,.75,.9,1.1,1.5,2.5,3.5,5), 
  pal = rev(RColorBrewer::brewer.pal(9, "RdBu")))

#library(mapview)
#mapview(breweries)

Vector data cube with two space dimensions: OD matrices

See section 4.3.2 here.

Vector data cubes: why?

Why don't we work on regular data.frame or tbl_df tables?

  • with arrays, it is immediately clear where the missing values are; with tables the "empty" records may be omitted (and not NA or 0 filled)
  • with tables, you may have duplicate records; no such thing with arrays
  • with tables, order of records is not given a priori; with arrays lookup is much faster, as each dimensions gives an index
  • with arrays you can directly operate over a dimension, or reduce it (e.g. taking the time mean); with tables this is much harder.
  • vector data cubes arise naturally when sampling raster data cubes (up next), or aggregating raster data cubes over polygons

Why not?

  • tables may be more efficient, memory-wise, if an array is very sparse (many NA or 0 values)
  • rather than be smart, use raw power (Google Big Query GIS; look how Carto analyses raster as vector data)

Raster data cubes: raster layer

(g = read_stars(system.file("external/test.grd", package = "raster")))
## stars object with 2 dimensions and 1 attribute
## attribute(s):
##    test.grd      
##  Min.   : 138.7  
##  1st Qu.: 294.0  
##  Median : 371.9  
##  Mean   : 425.6  
##  3rd Qu.: 501.0  
##  Max.   :1736.1  
##  NA's   :6022    
## dimension(s):
##   from  to offset delta                       refsys point values x/y
## x    1  80 178400    40 +proj=sterea +lat_0=52.15...    NA   NULL [x]
## y    1 115 334000   -40 +proj=sterea +lat_0=52.15...    NA   NULL [y]
plot(g, col = viridis::viridis(11), breaks = "equal")

Note that:

  • raster files are data cubes: the attribute has values for all combinations of a set of x and y values
  • where vector data cubes have x/y space features in a single dimension, raster data cubes spread x and y over two (x-y raster cells) or three separate (x-y-z voxels) dimensions.
  • this particular example is a regular grid: offset and delta are filled, and geographic coordinates can be computed, e.g. for the x dimension, from 1-based cell index \(i\) by \[x_i = \mbox{offset}_x + (i-1) * \mbox{delta}_x\]
  • this way, index 1 gives the edge, and 1.5 the center of the first grid cell.
  • delta for y is typically negative: going south row indexes increase while y coordinates decrease
  • the dimensions have a refsys (here: a deprecated Proj4 string), allowing a match to other world coordinates (e.g. plotting with leaflet/mapview)
  • what you see are the non-NA cells, the NA cells are not drawn, but they are there; we can show them by converting to features, drawing cell borders grey:
(g.sf = st_as_sf(g, na.rm = FALSE))
## Simple feature collection with 9200 features and 1 field
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 178400 ymin: 329400 xmax: 181600 ymax: 334000
## CRS:            unknown
## First 10 features:
##    test.grd                       geometry
## 1        NA POLYGON ((178400 334000, 17...
## 2        NA POLYGON ((178440 334000, 17...
## 3        NA POLYGON ((178480 334000, 17...
## 4        NA POLYGON ((178520 334000, 17...
## 5        NA POLYGON ((178560 334000, 17...
## 6        NA POLYGON ((178600 334000, 17...
## 7        NA POLYGON ((178640 334000, 17...
## 8        NA POLYGON ((178680 334000, 17...
## 9        NA POLYGON ((178720 334000, 17...
## 10       NA POLYGON ((178760 334000, 17...
plot(g.sf, border = 'grey', pal = viridis::viridis(9), nbreaks = 10)

Raster data cubes: multiple layers

A lot of raster information comes as multi-layer, the simplest being rgb images:

(r = read_stars(system.file("pictures/Rlogo.jpg", package = "rgdal"))) # the old one
## stars object with 3 dimensions and 1 attribute
## attribute(s):
##    Rlogo.jpg    
##  Min.   :  8.0  
##  1st Qu.:181.0  
##  Median :255.0  
##  Mean   :207.9  
##  3rd Qu.:255.0  
##  Max.   :255.0  
## dimension(s):
##      from  to offset delta refsys point values x/y
## x       1 200      0     1     NA    NA   NULL [x]
## y       1 175    175    -1     NA    NA   NULL [y]
## band    1   3     NA    NA     NA    NA   NULL
plot(r, breaks = "equal")

Obviously, such data can be plotted as colors; we will first convert the rgb values to R color values:

(r.rgb = st_rgb(r))
## stars object with 2 dimensions and 1 attribute
## attribute(s):
##     rgb3           
##  Length:35000      
##  Class :character  
##  Mode  :character  
## dimension(s):
##   from  to offset delta refsys point values x/y
## x    1 200      0     1     NA    NA   NULL [x]
## y    1 175    175    -1     NA    NA   NULL [y]
r.rgb[[1]][1:3,1]
## [1] "#FFFFFF" "#FFFFFF" "#FFFFFF"

before plotting it:

plot(r.rgb)

Multi-band data is much more common and goes beyond rgb; an exerpt with the 30m bands of a Landsat-7 image is found here:

L7file = system.file("tif/L7_ETMs.tif", package = "stars")
(L7 = read_stars(L7file))
## stars object with 3 dimensions and 1 attribute
## attribute(s):
##   L7_ETMs.tif    
##  Min.   :  1.00  
##  1st Qu.: 54.00  
##  Median : 69.00  
##  Mean   : 68.91  
##  3rd Qu.: 86.00  
##  Max.   :255.00  
## dimension(s):
##      from  to  offset delta                       refsys point values x/y
## x       1 349  288776  28.5 UTM Zone 25, Southern Hem... FALSE   NULL [x]
## y       1 352 9120761 -28.5 UTM Zone 25, Southern Hem... FALSE   NULL [y]
## band    1   6      NA    NA                           NA    NA   NULL

Plotting this uses histogram stretching over all bands. I think of histogram stretching as HDR in monochrome attribute space: each grey tone covers the same area, color breaks are quantiles of the map layer:

plot(L7)

We can also do the stretching over each band individually, meaning there is no common key:

plot(L7, join_zlim = FALSE)

From these bands we can also make color or false color composites:

par(mfrow = c(1, 2))
plot(L7, rgb = c(3,2,1), reset = FALSE, main = "RGB")    # rgb
plot(L7, rgb = c(4,3,2), main = "False color (NIR-R-G)") # false color

Transforming and warping rasters

Suppose we create a 1-degree grid over parts of Europe:

bb = st_bbox(c(xmin = -10, xmax = 20, ymin = 40, ymax = 60), crs = 4326)
(x = st_as_stars(bb, dx = 1, dy = 1))
## stars object with 2 dimensions and 1 attribute
## attribute(s):
##     values  
##  Min.   :0  
##  1st Qu.:0  
##  Median :0  
##  Mean   :0  
##  3rd Qu.:0  
##  Max.   :0  
## dimension(s):
##   from to offset delta refsys point values x/y
## x    1 30    -10     1 WGS 84    NA   NULL [x]
## y    1 20     60    -1 WGS 84    NA   NULL [y]

We can plot the grid outline e.g. by:

#sf_use_s2(FALSE)
library(rnaturalearth)
ne <- ne_countries(returnclass = "sf", continent = 'europe') %>%
    st_set_precision(1e8)
ne$pop_dens = units::set_units(ne$pop_est / st_area(ne), 1/(km^2))
plot(ne["pop_dens"], reset = FALSE, extent = bb)
pop = st_rasterize(ne["pop_dens"], x)
plot(st_as_sf(pop, na.rm = FALSE), add = TRUE, border = 'grey')

We can transform this grid e.g. to the ETRS89 / LAEA projection:

(pop.3035 = st_transform(pop, 3035))
## stars object with 2 dimensions and 1 attribute
## attribute(s):
##  pop_dens [1/km^2]
##  Min.   :  0.00   
##  1st Qu.:  0.00   
##  Median : 78.03   
##  Mean   : 80.40   
##  3rd Qu.:123.98   
##  Max.   :417.65   
## dimension(s):
##   from to offset delta                       refsys point
## x    1 30     NA    NA ETRS89-extended / LAEA Eu... FALSE
## y    1 20     NA    NA ETRS89-extended / LAEA Eu... FALSE
##                        values x/y
## x [30x20] 2680703,...,5127800 [x]
## y [30x20] 1933849,...,4195484 [y]
## curvilinear grid
ne.3035 = st_transform(ne, 3035)
plot(pop.3035, border = 'grey', reset = FALSE)
plot(st_geometry(ne.3035), add = TRUE, border = 'yellow')

Note that the transformed grid is no longer a regular grid (it does not have offset and delta values for x and y), but a curvilinear grid: for every grid cell the longitude/latitude pair is stored. All the grid cell values are exactly retained, their geometry only changed reference system!

If we need to work this curvilinear grid to a regular grid in the new reference system, we can either warp it:

target_grid = st_as_stars(st_bbox(pop.3035)) # add dimensions/cell sizes etc here
(w = st_warp(pop, target_grid)) # or give only a target crs
## stars object with 2 dimensions and 1 attribute
## attribute(s):
##  pop_dens [1/km^2]
##  Min.   :  0.00   
##  1st Qu.:  0.00   
##  Median : 80.68   
##  Mean   : 82.27   
##  3rd Qu.:123.98   
##  Max.   :417.65   
##  NA's   :10148    
## dimension(s):
##   from  to  offset    delta                       refsys point values x/y
## x    1 265 2680703  9241.65 ETRS89-extended / LAEA Eu...    NA   NULL [x]
## y    1 245 4195484 -9241.65 ETRS89-extended / LAEA Eu...    NA   NULL [y]
plot(w, border = 'grey', reset = FALSE)
plot(st_geometry(ne.3035), add = TRUE, border = 'yellow')

Suppose we had worked with population rather than population density, this warping would have caused a shift in total population; another approach would have been to use sf::st_interpolate_aw for area-weighted interpolation for this.

Raster data cubes: time and multiple attributes

A lot of Earth observation and modelling data from domains such as weather, climate, hydrology, oceanography come with a time dimension. Quite commonly, data cubes are stored in NetCDF (or HDF4, HDF5) formats. We have two examples; the first is a time series with two attributes, concerning monthly total precipitation, downscaled from CMIP:

(w <- system.file("nc/bcsd_obs_1999.nc", package = "stars") %>%
    read_stars("data/full_data_daily_2013.nc"))
## pr, tas,
## stars object with 3 dimensions and 2 attributes
## attribute(s):
##    pr [mm/m]         tas [C]      
##  Min.   :  0.59   Min.   :-0.421  
##  1st Qu.: 56.14   1st Qu.: 8.899  
##  Median : 81.88   Median :15.658  
##  Mean   :101.26   Mean   :15.489  
##  3rd Qu.:121.07   3rd Qu.:21.780  
##  Max.   :848.55   Max.   :29.386  
##  NA's   :7116     NA's   :7116    
## dimension(s):
##      from to offset  delta  refsys point                    values x/y
## x       1 81    -85  0.125      NA    NA                      NULL [x]
## y       1 33 37.125 -0.125      NA    NA                      NULL [y]
## time    1 12     NA     NA POSIXct    NA 1999-01-31,...,1999-12-31
plot(w)

The second concerns a time series split over multiple files,

x = c(
"avhrr-only-v2.19810901.nc",
"avhrr-only-v2.19810902.nc",
"avhrr-only-v2.19810903.nc",
"avhrr-only-v2.19810904.nc",
"avhrr-only-v2.19810905.nc",
"avhrr-only-v2.19810906.nc",
"avhrr-only-v2.19810907.nc",
"avhrr-only-v2.19810908.nc",
"avhrr-only-v2.19810909.nc"
)
# see the second vignette:
# install.packages("starsdata", repos = "http://gis-bigdata.uni-muenster.de", type = "source")
file_list = system.file(paste0("netcdf/", x), package = "starsdata")
(y = read_stars(file_list, quiet = TRUE))
## stars object with 4 dimensions and 4 attributes
## attribute(s), summary of first 1e+05 cells:
##    sst [°*C]       anom [°*C]      err [°*C]     ice [percent]  
##  Min.   :-1.80   Min.   :-4.69   Min.   :0.110   Min.   :0.010  
##  1st Qu.:-1.19   1st Qu.:-0.06   1st Qu.:0.300   1st Qu.:0.730  
##  Median :-1.05   Median : 0.52   Median :0.300   Median :0.830  
##  Mean   :-0.32   Mean   : 0.23   Mean   :0.295   Mean   :0.766  
##  3rd Qu.:-0.20   3rd Qu.: 0.71   3rd Qu.:0.300   3rd Qu.:0.870  
##  Max.   : 9.36   Max.   : 3.70   Max.   :0.480   Max.   :1.000  
##  NA's   :13360   NA's   :13360   NA's   :13360   NA's   :27377  
## dimension(s):
##      from   to                   offset  delta  refsys point values x/y
## x       1 1440                        0   0.25      NA    NA   NULL [x]
## y       1  720                       90  -0.25      NA    NA   NULL [y]
## zlev    1    1                    0 [m]     NA      NA    NA   NULL    
## time    1    9 1981-09-01 02:00:00 CEST 1 days POSIXct    NA   NULL

This variable contains a singular dimension, zlev, which we can remove using adrop:

(z = adrop(y))
## stars object with 3 dimensions and 4 attributes
## attribute(s), summary of first 1e+05 cells:
##    sst [°*C]       anom [°*C]      err [°*C]     ice [percent]  
##  Min.   :-1.80   Min.   :-4.69   Min.   :0.110   Min.   :0.010  
##  1st Qu.:-1.19   1st Qu.:-0.06   1st Qu.:0.300   1st Qu.:0.730  
##  Median :-1.05   Median : 0.52   Median :0.300   Median :0.830  
##  Mean   :-0.32   Mean   : 0.23   Mean   :0.295   Mean   :0.766  
##  3rd Qu.:-0.20   3rd Qu.: 0.71   3rd Qu.:0.300   3rd Qu.:0.870  
##  Max.   : 9.36   Max.   : 3.70   Max.   :0.480   Max.   :1.000  
##  NA's   :13360   NA's   :13360   NA's   :13360   NA's   :27377  
## dimension(s):
##      from   to                   offset  delta  refsys point values x/y
## x       1 1440                        0   0.25      NA    NA   NULL [x]
## y       1  720                       90  -0.25      NA    NA   NULL [y]
## time    1    9 1981-09-01 02:00:00 CEST 1 days POSIXct    NA   NULL

which we can for instance plot with ggplot:

library(ggplot2)
library(viridis)
## Loading required package: viridisLite
## Loading required package: viridisLite
library(ggthemes)
ggplot() +
  geom_stars(data = z[1], alpha = 0.8, downsample = c(10, 10, 1)) +
  facet_wrap("time") +
  scale_fill_viridis() +
  coord_equal() +
  theme_map() +
  theme(legend.position = "bottom") +
  theme(legend.key.width = unit(2, "cm"))