Catégories
Non classé

Using the geofacet package to spatially arrange plots

The {geofacet} package allows to « arrange a sequence of plots of data for different geographical entities into a grid that strives to preserve some of the original geographical orientation of the entities ».

Like the previous post, it’s interesting if you view each entity as a unit and don’t care for its real size or weight, and don’t want to spend too much time manually finding the best grid.

We will again use the same COVID-19 dataset. We manually add the overseas départements once we have found the right grid (by trying different seeds) and adjust Corsica position.

COVID-19 deceased in hospital, by département, for 100 000 inhab.
# packages ----------------------------------------------------------------
library(tidyverse)
library(httr)
library(fs)
library(sf)
library(readxl)
library(janitor)
library(glue)
library(geofacet)
# also install ragg

# sources -----------------------------------------------------------------

# https://www.data.gouv.fr/fr/datasets/donnees-hospitalieres-relatives-a-lepidemie-de-covid-19/
fichier_covid <- "donnees/covid.csv"
url_donnees_covid <- "https://www.data.gouv.fr/fr/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7"

# https://www.insee.fr/fr/statistiques/2012713#tableau-TCRD_004_tab1_departements
fichier_pop <- "donnees/pop.xls"
url_donnees_pop <- "https://www.insee.fr/fr/statistiques/fichier/2012713/TCRD_004.xls"

# Adminexpress : à télécharger manuellement
# https://geoservices.ign.fr/documentation/diffusion/telechargement-donnees-libres.html#admin-express
aex <- path_expand("~/Downloads/ADMIN-EXPRESS_2-2__SHP__FRA_2020-02-24/ADMIN-EXPRESS/1_DONNEES_LIVRAISON_2020-02-24")


# config ------------------------------------------------------------------

options(scipen = 999)

force_download <- FALSE # retélécharger même si le fichier existe et a été téléchargé aujourd'hui ?


# téléchargement -------------------------------------------------

if (!dir_exists("donnees")) dir_create("donnees")
if (!dir_exists("resultats")) dir_create("resultats")
if (!dir_exists("resultats/animation_spf")) dir_create("resultats/animation_spf")

if (!file_exists(fichier_covid) |
    file_info(fichier_covid)$modification_time < Sys.Date() |
    force_download) {
  GET(url_donnees_covid,
      progress(),
      write_disk(fichier_covid, overwrite = TRUE)) %>%
    stop_for_status()
}

if (!file_exists(fichier_pop)) {
  GET(url_donnees_pop,
      progress(),
      write_disk(fichier_pop)) %>%
    stop_for_status()
}

covid <- read_csv2(fichier_covid)

pop <- read_xls(fichier_pop, skip = 2) %>%
  clean_names()

# adminexpress prétéléchargé
dep <- read_sf(path(aex, "ADE_2-2_SHP_LAMB93_FR/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  st_set_crs(2154)


# construction de la grille ----------------------------------------

grid_fr <- dep %>%
  select(insee_dep, nom_dep) %>%
  grid_auto(names = "nom_dep", codes = "insee_dep", seed = 4) %>%
  add_row(row = 8,
          col = 1,
          name_nom_dep = "Guadeloupe",
          code_insee_dep = "971") %>%
  add_row(row = 9,
          col = 1,
          name_nom_dep = "Martinique",
          code_insee_dep = "972") %>%
  add_row(row = 10,
          col = 1,
          name_nom_dep = "Guyane",
          code_insee_dep = "973") %>%
  add_row(row = 7,
          col = 13,
          name_nom_dep = "Mayotte",
          code_insee_dep = "976") %>%
  add_row(row = 8,
          col = 13,
          name_nom_dep = "La Réunion",
          code_insee_dep = "974")

grid_fr[grid_fr$code_insee_dep %in% c("2A", "2B"), "col"] <- 13
grid_fr[grid_fr$code_insee_dep %in% c("2A", "2B"), "row"] <- grid_fr[grid_fr$code_insee_dep %in% c("2A", "2B"), "row"] - 1


# graphique -----------------------------------------------------

df <- covid %>%
  filter(sexe == 0) %>%
  rename(deces = dc,
         reanim = rea,
         hospit = hosp) %>%
  left_join(pop,
            by = c("dep" = "x1")) %>%
  mutate(incidence = deces / x2020_p * 100000) %>%
  rename(insee_dep = dep) %>%
  left_join(grid_fr %>%
              select(nom_dep = name_nom_dep,
                     insee_dep = code_insee_dep)) %>%
  drop_na(insee_dep) %>%
  ggplot(aes(jour, incidence)) +
    geom_area() +
    facet_geo(~ nom_dep, grid = grid_fr) +
    labs(title = "Mortalité",
       subtitle = "COVID-19 - France",
       x = "date",
       y = "décès pour\n100 000 hab.",
       caption = glue("http://r.iresmi.net/\ndonnées SPF {Sys.Date()}")) +
    theme_minimal() +
    theme(strip.text = element_text(hjust = 0, size = 7))

ggsave(glue("resultats/covid_fr_mortalite_geofacette_{Sys.Date()}.png"),
       width = 25, height = 20, units = "cm", scaling = .8, res = 300, device = ragg::agg_png)

Catégories
Non classé

Generalized Monty Hall problem

A simulation of the Monty Hall problem outcomes for n doors (k opened) à la Tidyverse…

library(tidyverse)

# sample vectors whether they have one or more elements
resample <- function(x, ...) x[sample.int(length(x), ...)]

monty <- function(doors = 3, monty_opens_doors = 1, n = 10000, seed = 0) {
	set.seed(seed)
	tibble(car = sample(doors, n, replace = TRUE),
	       choice = sample(doors, n, replace = TRUE)) %>% 
	  rowwise() %>% 
	  mutate(monty_chose = list(resample(setdiff(1:doors, c(car, choice)), monty_opens_doors)),
	         win_no_switch = car == choice,
	         win_switch = car == resample(setdiff(1:doors, unlist(c(choice, monty_chose))), 1)) %>% 
	  ungroup() %>% 
	  summarise(win_if_not_switching = sum(win_no_switch) / n() * 100,
	            win_with_switching = sum(win_switch) / n() * 100)
}
> monty() # classic values
# A tibble: 1 x 2
  win_if_not_switching win_with_switching
                 <dbl>              <dbl>
1                 33.4               66.6
> monty(10) # more doors (10), 1 opened
# A tibble: 1 x 2
  win_if_not_switching win_with_switching
                 <dbl>              <dbl>
1                 10.4               11.0
> monty(10, 3) # 10 doors, 3 opened
# A tibble: 1 x 2
  win_if_not_switching win_with_switching
                 <dbl>              <dbl>
1                 10.4               15.2

So, switch…

Catégories
Non classé

Polygons to hexagons

Hexagon tessellation using the great {geogrid} package.

The départements are the second level of administrative government in France. They neither have the same area nor the same population and this heterogeneity provides a few challenges for a fair and accurate map representation (see the post on smoothing).

However if we are just interested in the départements as units, we can use a regular grid for visualization. Since France is often called the hexagon, we could even use an hexagon tiling (a fractal map !)…

Creating the grid and conserving minimal topological relations and the general shape can be time consuming, but thanks to Geogrid it’s quite easy. The geogrid dev page provides nice examples. We will reuse our code of the COVID19 animation. The resulting GIS file is provided below.

# Carto décès COVID 19 hexagones
# France métro. + DOM
# Animation
# DONNEES SPF


# packages ----------------------------------------------------------------
library(tidyverse)
library(httr)
library(fs)
library(sf)
library(readxl)
library(janitor)
library(glue)
library(tmap)
library(grid)
library(classInt)
library(magick)
library(geogrid)


# sources -----------------------------------------------------------------

# https://www.data.gouv.fr/fr/datasets/donnees-hospitalieres-relatives-a-lepidemie-de-covid-19/
fichier_covid <- "donnees/covid.csv"
url_donnees_covid <- "https://www.data.gouv.fr/fr/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7"

# https://www.insee.fr/fr/statistiques/2012713#tableau-TCRD_004_tab1_departements
fichier_pop <- "donnees/pop.xls"
url_donnees_pop <- "https://www.insee.fr/fr/statistiques/fichier/2012713/TCRD_004.xls"

# Adminexpress : à télécharger manuellement
# https://geoservices.ign.fr/documentation/diffusion/telechargement-donnees-libres.html#admin-express
aex <- path_expand("~/Downloads/ADMIN-EXPRESS_2-2__SHP__FRA_2020-02-24/ADMIN-EXPRESS/1_DONNEES_LIVRAISON_2020-02-24")


# config ------------------------------------------------------------------

force_download <- FALSE # retélécharger même si le fichier existe et a été téléchargé aujourd'hui ?


# téléchargement ------------------------------------------------------

if (!dir_exists("donnees")) dir_create("donnees")
if (!dir_exists("resultats")) dir_create("resultats")
if (!dir_exists("resultats/animation_spf_hex")) dir_create("resultats/animation_spf_hex")

if (!file_exists(fichier_covid) |
    file_info(fichier_covid)$modification_time < Sys.Date() |
    force_download) {
  GET(url_donnees_covid,
      progress(),
      write_disk(fichier_covid, overwrite = TRUE)) %>%
    stop_for_status()
}

if (!file_exists(fichier_pop)) {
  GET(url_donnees_pop,
      progress(),
      write_disk(fichier_pop)) %>%
    stop_for_status()
}


# données -----------------------------------------------------------------

covid <- read_csv2(fichier_covid)

# adminexpress prétéléchargé
dep <- read_sf(path(aex, "ADE_2-2_SHP_LAMB93_FR/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  mutate(surf_ha = st_area(geometry) * 10000) %>%
  st_set_crs(2154)

# grille hexagonale
dep_cells_hex <- calculate_grid(shape = dep, grid_type = "hexagonal", seed = 3)
dep_hex <- assign_polygons(dep, dep_cells_hex) %>%
  st_set_crs(2154)

# Pour les DOM on duplique et déplace un département existant
d971 <- dep_hex[dep_hex$insee_dep == "29", ]
d971$geometry[[1]] <- d971$geometry[[1]] + st_point(c(0, -150000))
d971$insee_dep <- "971"

d972 <- dep_hex[dep_hex$insee_dep == "29", ]
d972$geometry[[1]] <- d972$geometry[[1]] + st_point(c(0, -250000))
d972$insee_dep <- "972"

d973 <- dep_hex[dep_hex$insee_dep == "29", ]
d973$geometry[[1]] <- d973$geometry[[1]] + st_point(c(0, -350000))
d973$insee_dep <- "973"

d974 <- dep_hex[dep_hex$insee_dep == "2A", ]
d974$geometry[[1]] <- d974$geometry[[1]] + st_point(c(0, 250000))
d974$insee_dep <- "974"

d976 <- dep_hex[dep_hex$insee_dep == "2A", ]
d976$geometry[[1]] <- d976$geometry[[1]] + st_point(c(0, 350000))
d976$insee_dep <- "976"

dep_hex <- rbind(dep_hex, d971, d972, d973, d974, d976)

# population
pop <- read_xls(fichier_pop, skip = 2) %>%
  clean_names()

# lignes de séparation DOM / métropole
encarts <- st_multilinestring(
  list(st_linestring(matrix(c(1100000, 6500000,
                              1100000, 6257000,
                              1240000, 6257000), byrow = TRUE, nrow = 3)),
       st_linestring(matrix(c(230000, 6692000,
                              230000, 6391000), byrow = TRUE, nrow = 2)))) %>%
  st_sfc() %>%
  st_sf(id = 1, geometry = .) %>%
  st_set_crs(2154)

# traitement --------------------------------------------------------------

# jointures des données
creer_df <- function(territoire, date = NULL) {
  territoire %>%
    left_join(pop, by = c("insee_dep" = "x1")) %>%
    left_join(
      covid %>%
        filter(jour == if_else(is.null(date), max(jour), date),
               sexe == 0) %>%
        rename(deces = dc,
               reanim = rea,
               hospit = hosp),
      by = c("insee_dep" = "dep")) %>%
    mutate(incidence = deces / x2020_p * 100000)
}

incidence <- creer_df(dep_hex)

set.seed(1234)
classes <- classIntervals(incidence$incidence, n = 6, style = "kmeans", dataPrecision = 0)$brks

# carto -------------------------------------------------------------------
# décès cate du dernier jour dispo

carte <- tm_layout(title = glue("COVID-19\nFrance\n{max(covid$jour)}"),
                         legend.position = c("left", "bottom"),
                         frame = FALSE) +
  tm_shape(incidence) +
  tm_polygons(col = "incidence", title = "décés\ncumulés pour\n100 000 hab.",
              breaks = classes,
              palette = "viridis",
              legend.reverse = TRUE,
              legend.format = list(text.separator = "à moins de",
                                   digits = 0)) +
  tm_text("insee_dep", size = .8) +
  tm_shape(encarts) +
  tm_lines(lty = 3) +
  tm_credits(glue("http://r.iresmi.net/
                    classif. kmeans
                    données départementales Santé Publique France,
                    INSEE RP 2020, d'après IGN Adminexpress 2020"),
             position = c(.6, 0),
             size = .5)

fichier_carto <- glue("resultats/covid_hex_fr_{max(covid$jour)}.png")

tmap_save(carte, fichier_carto, width = 900, height = 900, scale = .4)


# animation ---------------------------------------------------------------

image_animation <- function(date) {
  message(glue("\n\n{date}\n==========\n"))

  m <- creer_df(dep_hex, date) %>%
    tm_shape() +
    tm_polygons(col = "incidence", title = "décés\ncumulés pour\n100 000 hab.",
                breaks = classes,
                palette = "viridis",
                legend.reverse = TRUE,
                legend.format = list(text.separator = "à moins de",
                                     digits = 0)) +
    tm_text("insee_dep", size = .8) +
    tm_shape(encarts) +
    tm_lines(lty = 3) +
    tm_layout(title = glue("COVID-19\nFrance\n{date}"),
              legend.position = c("left", "bottom"),
              frame = FALSE) +
    tm_credits(glue("http://r.iresmi.net/
                    classif. kmeans
                    données départementales Santé Publique France,
                    INSEE RP 2020, d'après IGN Adminexpress 2020"),
               position = c(.6, 0),
               size = .5)

  tmap_save(m, glue("resultats/animation_spf_hex/covid_fr_{date}.png"),
            width = 800, height = 800, scale = .4,)
}

unique(covid$jour) %>%
  walk(image_animation)

animation <- glue("resultats/deces_covid19_fr_hex_spf_{max(covid$jour)}.gif")

dir_ls("resultats/animation_spf_hex") %>%
  map(image_read) %>%
  image_join() %>%
  image_animate(fps = 2, optimize = TRUE) %>%
  image_write(animation)

COVID decease

The global shape and relations are well rendered. Deformations are quite important for the small départements around Paris, but the map remains legible.

Shift
Catégories
Non classé

Europe COVID-19 death map

COVID-19 deaths in Europe
# Europe COVID-19 deaths animated map
# http://r.iresmi.net/
# data European Centre for Disease Prevention and Control


# packages ----------------------------------------------------------------
library(tidyverse)
library(httr)
library(fs)
library(sf)
library(readxl)
library(janitor)
library(glue)
library(tmap)
library(grid)
library(classInt)
library(magick)
# + btb, raster, fasterize, plyr


# sources -----------------------------------------------------------------

# https://data.europa.eu/euodp/en/data/dataset/covid-19-coronavirus-data
covid_file <- "covid_eu.csv"
covid_url <- "https://opendata.ecdc.europa.eu/covid19/casedistribution/csv"

countries_file <- "ne_50m_admin_0_countries.shp"
countries_url <- "https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/50m/cultural/ne_50m_admin_0_countries.zip"


# config ------------------------------------------------------------------

radius <- 600000 # smoothing radius (m)
pixel <- 100000 # grid resolution (m)

force_download <- FALSE # download even if already downloaded today ?

#' Kernel weighted smoothing with arbitrary bounding area
#'
#' @param df sf object (points)
#' @param field weight field in the df
#' @param bandwidth kernel bandwidth (map units)
#' @param resolution output grid resolution (map units)
#' @param zone sf study zone (polygon)
#' @param out_crs EPSG (should be an equal-area projection)
#'
#' @return a raster object
#' @import btb, raster, fasterize, dplyr, plyr, sf
lissage <- function(df, field, bandwidth, resolution, zone, out_crs = 3035) {
  if (st_crs(zone)$epsg != out_crs) {
    message("reprojecting data...")
    zone <- st_transform(zone, out_crs)
  }

  if (st_crs(df)$epsg != out_crs) {
    message("reprojecting study zone...")
    df <- st_transform(df, out_crs)
  }

  zone_bbox <- st_bbox(zone)

  # grid generation
  message("generating reference grid...")
  zone_xy <- zone %>%
    dplyr::select(geometry) %>%
    st_make_grid(
      cellsize = resolution,
      offset = c(plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
                 plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor)),
      what = "centers") %>%
    st_sf() %>%
    st_join(zone, join = st_intersects, left = FALSE) %>%
    st_coordinates() %>%
    as_tibble() %>%
    dplyr::select(x = X, y = Y)

  # kernel
  message("computing kernel...")
  kernel <- df %>%
    cbind(., st_coordinates(.)) %>%
    st_set_geometry(NULL) %>%
    dplyr::select(x = X, y = Y, field) %>%
    btb::kernelSmoothing(
      dfObservations = .,
      sEPSG = out_crs,
      iCellSize = resolution,
      iBandwidth = bandwidth,
      vQuantiles = NULL,
      dfCentroids = zone_xy
    )

  # rasterization
  message("\nrasterizing...")
  raster::raster(
    xmn = plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
    ymn = plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor),
    xmx = plyr::round_any(zone_bbox[3] + bandwidth, resolution, f = ceiling),
    ymx = plyr::round_any(zone_bbox[4] + bandwidth, resolution, f = ceiling),
    resolution = resolution
  ) %>%
    fasterize::fasterize(kernel, ., field = field)
}


# download data ------------------------------------------------------------

if (!dir_exists("data")) dir_create("data")
if (!dir_exists("results")) dir_create("results")
if (!dir_exists("results/animation_eu")) dir_create("results/animation_eu")

if (!file_exists(path("data", covid_file)) |
    file_info(path("data", covid_file))$modification_time < Sys.Date() |
    force_download) {
  GET(covid_url,
      progress(),
      write_disk(path("data", covid_file), overwrite = TRUE)) %>%
    stop_for_status()
}

if (!file_exists(path("data", countries_file))) {
  dl <- file_temp()

  GET(countries_url,
      progress(),
      write_disk(dl)) %>%
    stop_for_status()

  unzip(dl, exdir = "data")
}


# data --------------------------------------------------------------------

# some countries doesn't have data for the first or latest days ; we fill with latest
# data
covid <- read_csv(path("data", covid_file),
                  col_types = cols(dateRep = col_date(format = "%d/%m/%Y")),
                  na = c("N/A", "")) %>%
  clean_names() %>%
  complete(geo_id, date_rep) %>%
  replace_na(list(deaths = 0)) %>%
  group_by(geo_id) %>%
  arrange(date_rep) %>%
  mutate(deaths_cum = cumsum(deaths)) %>%
  fill(countryterritory_code, countries_and_territories, pop_data2018, continent_exp, .direction = "up") %>%
  ungroup() %>%
  select(-c(day, month, year, cases))

# keep only european countries minus Russia and adding TUR and CYP
# remove overseas territories, reproject in LAEA
countries <- read_sf(path("data", countries_file)) %>%
  clean_names() %>%
  filter(continent == "Europe" & iso_a3_eh != "RUS" | iso_a3_eh %in% c("TUR", "CYP")) %>%
  st_cast("POLYGON") %>%
  st_set_crs(4326) %>%
  st_join(c(xmin = -20, xmax = 35, ymin = 35, ymax = 70) %>%
            st_bbox() %>%
            st_as_sfc() %>%
            st_as_sf() %>%
            st_set_crs(4326),
          left = FALSE) %>%
  group_by(iso_a3_eh) %>%
  summarise(geometry = st_combine(geometry)) %>%
  st_transform(3035)

# pretreatment -----------------------------------------------------------


# mask to generate grid : union all countries
unioned_countries_file <- "data/eu.rds"

if (!file_exists(unioned_countries_file)) {
  unioned_countries <- countries %>%
    st_union() %>%
    st_sf() %>%
    write_rds(unioned_countries_file)
} else {
  unioned_countries <- read_rds(unioned_countries_file)
}

# join countries/data for a specific date
create_df <- function(territory, date = NULL) {
  covid %>%
    filter(date_rep == if_else(is.null(date), max(date_rep), date)) %>%
    right_join(countries,
              by = c("countryterritory_code" = "iso_a3_eh")) %>%
    st_as_sf() %>%
    st_point_on_surface() %>% 
    drop_na(deaths_cum) %>% 
    st_as_sf()
}

covid_geo <- create_df(countries)


# smoothing for last date ---------------------------------------------------

# deaths
d <- covid_geo %>%
  lissage("deaths_cum", radius, pixel, unioned_countries)

# population 
p <- covid_geo %>%
  lissage("pop_data2018", radius, pixel, unioned_countries)

# grid per 100000 inhab
death_pop <- d * 100000 / p


# carto -------------------------------------------------------------------

# classification for last date to be reused in animation
set.seed(1234)
classes <- classIntervals(raster::values(death_pop), n = 6, style = "kmeans", dataPrecision = 0)$brks


# animation ---------------------------------------------------------------

image_animation <- function(date) {
  message(glue("\n\n{date}\n==========\n"))

  m <- create_df(countries, date) %>%
    lissage("deaths_cum", radius, pixel, unioned_countries) %>%
    magrittr::divide_by(p) %>%
    magrittr::multiply_by(100000) %>%
    tm_shape() +
    tm_raster(title = glue("deaths
                         per 100 000 inhab."),
              style = "fixed",
              breaks = classes,
              palette = "viridis",
              legend.format = list(text.separator = "to less than",
                                   digits = 0),
              legend.reverse = TRUE) +
    tm_layout(title = glue("COVID-19 - Europe\ncumulative as of {date}"),
              legend.position = c("right", "top"),
              frame = FALSE) +
    #tm_shape(countries, bbox = death_pop) +
    #tm_borders() +
    tm_credits(glue("http://r.iresmi.net/
                  bisquare kernel smoothing {radius / 1000} km on {pixel / 1000} km grid
                  classif. kmeans, LAEA Europe projection
                  data European Centre for Disease Prevention and Control / map Naturalearth"),
               size = .5,
               position = c(.5, .025))
  
  message("saving map...")
  tmap_save(m, glue("results/animation_eu/covid_eu_{date}.png"),
            width = 800, height = 800, scale = .4,)
}

covid %>% 
  filter(date_rep >= "2020-03-15") %>% 
  pull(date_rep) %>% 
  unique() %>%
  walk(image_animation)

animation <- glue("results/deaths_covid19_eu_{max(covid$date_rep)}.gif")

dir_ls("results/animation_eu") %>%
  map(image_read) %>%
  image_join() %>%
  #image_scale("500x500") %>%
  image_morph(frames = 1) %>%
  image_animate(fps = 2, optimize = TRUE) %>%
  image_write(animation)
Catégories
Non classé

COVID-19 decease animation map

Coronavirus decease in France
# Animation carto décès COVID 19 France
# avec lissage

# packages -----------------------------------------------------------------
library(tidyverse)
library(httr)
library(fs)
library(sf)
library(readxl)
library(janitor)
library(glue)
library(tmap)
library(grid)
library(classInt)
library(magick)
# + btb, raster, fasterize, plyr

# sources -----------------------------------------------------------------

# https://www.data.gouv.fr/fr/datasets/donnees-hospitalieres-relatives-a-lepidemie-de-covid-19/
fichier_covid <- "donnees/covid.csv"
url_donnees_covid <- "https://www.data.gouv.fr/fr/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7"

# https://www.insee.fr/fr/statistiques/2012713#tableau-TCRD_004_tab1_departements
fichier_pop <- "donnees/pop.xls"
url_donnees_pop <- "https://www.insee.fr/fr/statistiques/fichier/2012713/TCRD_004.xls"

# Adminexpress : à télécharger manuellement
# https://geoservices.ign.fr/documentation/diffusion/telechargement-donnees-libres.html#admin-express
aex <- path_expand("~/Downloads/ADMIN-EXPRESS_2-2__SHP__FRA_2020-02-24/ADMIN-EXPRESS/1_DONNEES_LIVRAISON_2020-02-24")

# config ------------------------------------------------------------------

rayon <- 100000 # distance de lissage (m)
pixel <- 10000 # résolution grille (m)

force_download <- FALSE # retélécharger même si le fichier existe et a été téléchargé aujourd'hui ?

#' Kernel weighted smoothing with arbitrary bounding area
#'
#' @param df sf object (points)
#' @param field weight field in the df
#' @param bandwidth kernel bandwidth (map units)
#' @param resolution output grid resolution (map units)
#' @param zone sf study zone (polygon)
#' @param out_crs EPSG (should be an equal-area projection)
#'
#' @return a raster object
#' @import btb, raster, fasterize, dplyr, plyr, sf
lissage <- function(df, field, bandwidth, resolution, zone, out_crs = 3035) {
  if (st_crs(zone)$epsg != out_crs) {
    message("reprojecting data...")
    zone <- st_transform(zone, out_crs)
  }
  
  if (st_crs(df)$epsg != out_crs) {
    message("reprojecting study zone...")
    df <- st_transform(df, out_crs)
  }
  
  zone_bbox <- st_bbox(zone)
  
  # grid generation
  message("generating reference grid...")
  zone_xy <- zone %>%
    dplyr::select(geometry) %>%
    st_make_grid(
      cellsize = resolution,
      offset = c(
        plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
        plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor)
      ),
      what = "centers"
    ) %>%
    st_sf() %>%
    st_join(zone, join = st_intersects, left = FALSE) %>%
    st_coordinates() %>%
    as_tibble() %>%
    dplyr::select(x = X, y = Y)
  
  # kernel
  message("computing kernel...")
  kernel <- df %>%
    cbind(., st_coordinates(.)) %>%
    st_set_geometry(NULL) %>%
    dplyr::select(x = X, y = Y, field) %>%
    btb::kernelSmoothing(
      dfObservations = .,
      sEPSG = out_crs,
      iCellSize = resolution,
      iBandwidth = bandwidth,
      vQuantiles = NULL,
      dfCentroids = zone_xy
    )
  
  # rasterization
  message("\nrasterizing...")
  raster::raster(
    xmn = plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
    ymn = plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor),
    xmx = plyr::round_any(zone_bbox[3] + bandwidth, resolution, f = ceiling),
    ymx = plyr::round_any(zone_bbox[4] + bandwidth, resolution, f = ceiling),
    resolution = resolution
  ) %>%
    fasterize::fasterize(kernel, ., field = field)
}


# téléchargement--------------------------------------------------------------

if (!dir_exists("donnees")) dir_create("donnees")
if (!dir_exists("resultats")) dir_create("resultats")
if (!dir_exists("resultats/animation")) dir_create("resultats/animation")

if (!file_exists(fichier_covid) |
    file_info(fichier_covid)$modification_time < Sys.Date() |
    force_download) {
  GET(url_donnees_covid,
      progress(),
      write_disk(fichier_covid, overwrite = TRUE))
}

if (!file_exists(fichier_pop)) {
  GET(url_donnees_pop,
      progress(),
      write_disk(fichier_pop))
}


# données -----------------------------------------------------------------

covid <- read_csv2(fichier_covid)

# adminexpress prétéléchargé
dep <- read_sf(path(aex, "ADE_2-2_SHP_LAMB93_FR/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  st_set_crs(2154)

pop <- read_xls(fichier_pop, skip = 2) %>%
  clean_names()


# prétraitement -----------------------------------------------------------

# contour métropole pour grille de référence
fichier_fr <- "donnees/fr.rds"

if (!file_exists(fichier_fr)) {
  fr <- dep %>%
    st_union() %>%
    st_sf() %>%
    write_rds(fichier_fr)
} else {
  fr <- read_rds(fichier_fr)
}

# jointures des données
creer_df <- function(territoire, date = NULL) {
  territoire %>%
    left_join(pop, by = c("insee_dep" = "x1")) %>%
    left_join(
      covid %>%
        filter(jour == if_else(is.null(date), max(jour), date),
               sexe == 0) %>%
               rename(deces = dc,
                      reanim = rea,
                      hospit = hosp),
      by = c("insee_dep" = "dep")) %>%
    st_point_on_surface()
}

covid_geo_pop <- creer_df(dep)


# lissage -----------------------------------------------------------------
# génération de la dernière grille mortalité
# et création des grilles pour 100000 habitants

# décès métropole 
d <- covid_geo_pop %>%
  lissage("deces", rayon, pixel, fr)


# population métropole et DOM
p <- covid_geo_pop %>%
  lissage("x2020_p", rayon, pixel, fr)

# grilles pour 100000 hab
d100k <- d * 100000 / p


# classification à réutiliser pour les autres cartes
set.seed(1234)
classes <- classIntervals(raster::values(d100k), n = 6, style = "kmeans", dataPrecision = 0)$brks


# animation ---------------------------------------------------------------

image_animation <- function(date) {
  m <- creer_df(dep, date) %>%
    lissage("deces", rayon, pixel, fr) %>%
    magrittr::divide_by(p) %>%
    magrittr::multiply_by(100000) %>%
    tm_shape() +
    tm_raster(title = glue("décès à l'hôpital
                         pour 100 000 hab."),
              style = "fixed",
              breaks = classes,
              palette = "viridis",
              legend.format = list(text.separator = "à moins de",
                                   digits = 0),
              legend.reverse = TRUE) +
    tm_shape(dep) +
    tm_borders() +
    tm_layout(title = glue("COVID-19 - France métropolitaine - cumul au {date}"),
              legend.position = c("left", "bottom"),
              frame = FALSE) +
    tm_credits(glue("http://r.iresmi.net/
                  lissage noyau bisquare {rayon / 1000} km sur grille {pixel / 1000} km
                  classif. kmeans
                  projection LAEA Europe
                  données départementales Santé publique France,
                  INSEE RP 2020, IGN Adminexpress 2020"),
               size = .5,
               position = c(.5, .025))
  
  tmap_save(m, glue("resultats/animation/covid_fr_{date}.png"),
            width = 800, height = 800, scale = .4,)
}

unique(covid$jour) %>%
  walk(image_animation)

animation <- glue("resultats/deces_covid19_fr_{max(covid$jour)}.gif")

dir_ls("resultats/animation") %>%
  map(image_read) %>%
  image_join() %>%
  #image_scale("500x500") %>%
  image_morph(frames = 1) %>%
  image_animate(fps = 2, optimize = TRUE) %>%
  image_write(animation)


Catégories
Non classé

Coronavirus : spatially smoothed decease in France

Coronavirus decease in France

See also the animated map.

From the official data by Santé Publique France, we spatially smooth the decease (produced by SPF at the département scale) and normalize by a similarly smoothed population grid. For that we use the {btb} package.

# Carto décès COVID 19 France
# avec lissage


# sources -----------------------------------------------------------------

# https://www.data.gouv.fr/fr/datasets/donnees-hospitalieres-relatives-a-lepidemie-de-covid-19/
fichier_covid <- "donnees/covid.csv"
url_donnees_covid <- "https://www.data.gouv.fr/fr/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7"

# https://www.insee.fr/fr/statistiques/2012713#tableau-TCRD_004_tab1_departements
fichier_pop <- "donnees/pop.xls"
url_donnees_pop <- "https://www.insee.fr/fr/statistiques/fichier/2012713/TCRD_004.xls"

# Adminexpress : à télécharger manuellement
# https://geoservices.ign.fr/documentation/diffusion/telechargement-donnees-libres.html#admin-express
#aex <- "donnees/1_DONNEES_LIVRAISON_2019-03-14/"
aex <- path_expand("~/Downloads/ADMIN-EXPRESS_2-2__SHP__FRA_2020-02-24/ADMIN-EXPRESS/1_DONNEES_LIVRAISON_2020-02-24")

# config ------------------------------------------------------------------
library(tidyverse)
library(httr)
library(fs)
library(sf)
library(readxl)
library(janitor)
library(glue)
library(tmap)
library(grid)
library(classInt)
# + btb, raster, fasterize, plyr

rayon <- 100000 # distance de lissage (m)
pixel <- 10000 # résolution grille (m)

force_download <- TRUE # retélécharger même si le fichier existe et a été téléchargé aujourd'hui ?

#' Kernel weighted smoothing with arbitrary bounding area
#'
#' @param df sf object (points)
#' @param field weight field in the df
#' @param bandwidth kernel bandwidth (map units)
#' @param resolution output grid resolution (map units)
#' @param zone sf study zone (polygon)
#' @param out_crs EPSG (should be an equal-area projection)
#'
#' @return a raster object
#' @import btb, raster, fasterize, dplyr, plyr, sf
lissage <- function(df, field, bandwidth, resolution, zone, out_crs = 3035) {
    if (st_crs(zone)$epsg != out_crs) {
      message("reprojecting data...")
      zone <- st_transform(zone, out_crs)
    }

    if (st_crs(df)$epsg != out_crs) {
      message("reprojecting study zone...")
      df <- st_transform(df, out_crs)
    }

    zone_bbox <- st_bbox(zone)

    # grid generation
    message("generating reference grid...")
    zone_xy <- zone %>%
      dplyr::select(geometry) %>%
      st_make_grid(
        cellsize = resolution,
        offset = c(
          plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
          plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor)
        ),
        what = "centers"
      ) %>%
      st_sf() %>%
      st_join(zone, join = st_intersects, left = FALSE) %>%
      st_coordinates() %>%
      as_tibble() %>%
      dplyr::select(x = X, y = Y)

    # kernel
    message("computing kernel...")
    kernel <- df %>%
      cbind(., st_coordinates(.)) %>%
      st_set_geometry(NULL) %>%
      dplyr::select(x = X, y = Y, field) %>%
      btb::kernelSmoothing(
        dfObservations = .,
        sEPSG = out_crs,
        iCellSize = resolution,
        iBandwidth = bandwidth,
        vQuantiles = NULL,
        dfCentroids = zone_xy
      )

    # rasterization
    message("\nrasterizing...")
    raster::raster(
      xmn = plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
      ymn = plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor),
      xmx = plyr::round_any(zone_bbox[3] + bandwidth, resolution, f = ceiling),
      ymx = plyr::round_any(zone_bbox[4] + bandwidth, resolution, f = ceiling),
      resolution = resolution
    ) %>%
      fasterize::fasterize(kernel, ., field = field)
  }


# téléchargement--------------------------------------------------------------
if (!file_exists(fichier_covid) |
    file_info(fichier_covid)$modification_time < Sys.Date() |
    force_download) {
  GET(url_donnees_covid,
      progress(),
      write_disk(fichier_covid, overwrite = TRUE))
}

if (!file_exists(fichier_pop)) {
  GET(url_donnees_pop,
      progress(),
      write_disk(fichier_pop))
}


# données -----------------------------------------------------------------

covid <- read_csv2(fichier_covid)

# adminexpress prétéléchargé
dep <- read_sf(path(aex, "ADE_2-2_SHP_LAMB93_FR/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  st_set_crs(2154)

dep_971 <- read_sf(path(aex, "ADE_2-2_SHP_RGAF09UTM20_D971/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  st_set_crs(5490)

dep_972 <- read_sf(path(aex, "ADE_2-2_SHP_RGAF09UTM20_D972/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  st_set_crs(5490)

dep_973 <- read_sf(path(aex, "ADE_2-2_SHP_UTM22RGFG95_D973/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  st_set_crs(2972)

dep_974 <- read_sf(path(aex, "ADE_2-2_SHP_RGR92UTM40S_D974/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  st_set_crs(2975)

dep_976 <- read_sf(path(aex, "ADE_2-2_SHP_RGM04UTM38S_D976/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  st_set_crs(4471)

pop <- read_xls(fichier_pop, skip = 2) %>%
  clean_names()


# prétraitement -----------------------------------------------------------

# contour métropole
fr <- dep %>%
  st_union() %>%
  st_sf()

# jointures des données
creer_df <- function(territoire) {
  territoire %>%
    left_join(pop, by = c("insee_dep" = "x1")) %>%
    left_join(
      covid %>%
        filter(jour == max(jour),
               sexe == 0) %>%
        group_by(dep) %>%
        summarise(deces = sum(dc, na.rm = TRUE),
                  reanim = sum(rea, na.rm = TRUE),
                  hospit = sum(hosp, na.rm = TRUE)),
      by = c("insee_dep" = "dep")) %>%
    st_point_on_surface()
}

covid_geo_pop     <- creer_df(dep)
covid_geo_pop_971 <- creer_df(dep_971)
covid_geo_pop_972 <- creer_df(dep_972)
covid_geo_pop_973 <- creer_df(dep_973)
covid_geo_pop_974 <- creer_df(dep_974)
covid_geo_pop_976 <- creer_df(dep_976)

# lissage -----------------------------------------------------------------
# génération des grilles mortalité, hospitalisation et réanimation et population
# et création des grilles pour 100000 habitants

# décès métropole et DOM
d <- covid_geo_pop %>%
  lissage("deces", rayon, pixel, fr)

d_971 <- covid_geo_pop_971 %>%
  lissage("deces", rayon, pixel, dep_971, 5490)

d_972 <- covid_geo_pop_972 %>%
  lissage("deces", rayon, pixel, dep_972, 5490)

d_973 <- covid_geo_pop_973 %>%
  lissage("deces", rayon, pixel, dep_973, 2972)

d_974 <- covid_geo_pop_974 %>%
  lissage("deces", rayon, pixel, dep_974, 2975)

d_976 <- covid_geo_pop_976 %>%
  lissage("deces", rayon, pixel, dep_976, 4471)

# population métropole et DOM
p <- covid_geo_pop %>%
  lissage("x2020_p", rayon, pixel, fr)

p_971 <- covid_geo_pop_971 %>%
  lissage("x2020_p", rayon, pixel, dep_971, 5490)

p_972 <- covid_geo_pop_972 %>%
  lissage("x2020_p", rayon, pixel, dep_972, 5490)

p_973 <- covid_geo_pop_973 %>%
  lissage("x2020_p", rayon, pixel, dep_973, 2972)

p_974 <- covid_geo_pop_974 %>%
  lissage("x2020_p", rayon, pixel, dep_974, 2975)

p_976 <- covid_geo_pop_976 %>%
  lissage("x2020_p", rayon, pixel, dep_976, 4471)

# grilles pour 100000 hab
d100k <- d * 100000 / p
d100k_971 <- d_971 * 100000 / p_971
d100k_972 <- d_972 * 100000 / p_972
d100k_973 <- d_973 * 100000 / p_973
d100k_974 <- d_974 * 100000 / p_974
d100k_976 <- d_976 * 100000 / p_976

# réanimation et hospitalisation métropole uniquement
r <- covid_geo_pop %>%
  lissage("reanim", rayon, pixel, fr)
r100k <- r * 100000 / p

h <- covid_geo_pop %>%
  lissage("hospit", rayon, pixel, fr)
h100k <- h * 100000 / p


# carto -------------------------------------------------------------------

# décès métropole et DOM

# classification à réutiliser pour les 6 cartes
set.seed(1234)
classes <- classIntervals(raster::values(d100k), n = 5, style = "kmeans", dataPrecision = 0)$brks

# métro et DOM
(carte_d <- tm_layout(title = paste("COVID-19 - France - cumul au", max(covid$jour)),
                     legend.position = c("left", "bottom"),
                     frame = FALSE) +
  tm_shape(d100k) +
  tm_raster(title = glue("décès à l'hôpital
                         pour 100 000 hab."),
            style = "fixed",
            breaks = classes,
            palette = "viridis",
            legend.format = list(text.separator = "à moins de",
                                 digits = 0),
            legend.reverse = TRUE) +
  tm_shape(dep) +
  tm_borders() +
  tm_credits(glue("http://r.iresmi.net/
                  lissage noyau bisquare {rayon / 1000} km sur grille {pixel / 1000} km
                  classif. kmeans
                  projections LAEA Europe (métropole) et locales (DOM)
                  données départementales Santé publique France,
                  INSEE RP 2020, IGN Adminexpress 2020"),
             size = .5,
             position = c(.5, .025))
)

tm_971 <- tm_shape(d100k_971, ext = 0.7) +
  tm_raster(style = "fixed",
            breaks = classes,
            palette = "viridis",
            legend.show = FALSE) +
  tm_shape(dep_971) +
  tm_borders() +
  tm_layout(frame = FALSE,
            bg.color = NA)

tm_972 <- tm_shape(d100k_972, ext = 0.7) +
  tm_raster(style = "fixed",
            breaks = classes,
            palette = "viridis",
            legend.show = FALSE) +
  tm_shape(dep_972) +
  tm_borders() +
  tm_layout(frame = FALSE,
            bg.color = NA)

tm_973 <- tm_shape(d100k_973) +
  tm_raster(style = "fixed",
            breaks = classes,
            palette = "viridis",
            legend.show = FALSE) +
  tm_shape(dep_973) +
  tm_borders() +
  tm_layout(frame = FALSE,
            bg.color = NA)

tm_974 <- tm_shape(d100k_974, ext = 0.75) +
  tm_raster(style = "fixed",
            breaks = classes,
            palette = "viridis",
            legend.show = FALSE) +
  tm_shape(dep_974) +
  tm_borders()+
  tm_layout(frame = FALSE,
            bg.color = NA)

tm_976 <- tm_shape(d100k_976, ext = 0.6) +
  tm_raster(style = "fixed",
            breaks = classes,
            palette = "viridis",
            legend.show = FALSE) +
  tm_shape(dep_976) +
  tm_borders()+
  tm_layout(frame = FALSE,
            bg.color = NA)

# assemblage
fichier_carto <- glue("resultats/covid_fr_{max(covid$jour)}.png")

tmap_save(carte_d, fichier_carto, width = 900, height = 900, scale = .4,
          insets_tm = list(tm_971, tm_972, tm_973, tm_974, tm_976),
          insets_vp = list(viewport(x = .1, y = .65, width = .15, height = .15),
                           viewport(x = .1, y = .58, width = .15, height = .15),
                           viewport(x = .15, y = .4, width = .35, height = .45),
                           viewport(x = .9, y = .4, width = .15, height = .15),
                           viewport(x = .9, y = .5, width = .15, height = .15)))

Catégories
Non classé

Mauna Loa CO₂ polar plot

After a classic plot of the Keeling curve (see our former post) used on Wikipedia, we can explore another data visualization. The CO₂ atmospheric concentration, the main cause of the climate warming, is following a seasonal cycle so it could be interesting (or ironic ?) to use a polar plot.

Config and data

We only keep two translations for brevity here…

# Required packages
library(tidyverse)
library(scales)
library(lubridate)

# Translations ------------------------------------------------------------

language <- list(
  en_US = list(
    locale_lc_time = "en_US.UTF-8",
    title = bquote("Monthly mean"~CO[2]~"concentration"),
    caption = paste("Data : P. Tans, NOAA/ESRL (www.esrl.noaa.gov/gmd/ccgg/trends/)\nand R. Keeling, Scripps Institution of Oceanography (scrippsco2.ucsd.edu/). Accessed", Sys.Date()),
    x = "Year",
    y = bquote(CO[2]~"fraction in dry air ("*mu*"mol/mol)"),
    x2 = "Month",
    y2 = bquote(atop(CO[2]~"fraction in dry air ("*mu*"mol/mol)", "Departure from yearly average")),
    title2 = "Seasonal variation"
  ),
  fr_FR = list(
    locale_lc_time = "fr_FR.UTF-8",
    title = bquote("Moyenne mensuelle de la concentration de"~CO[2]),
    caption = paste("données : P. Tans, NOAA/ESRL (www.esrl.noaa.gov/gmd/ccgg/trends/)\net R. Keeling, Scripps Institution of Oceanography (scrippsco2.ucsd.edu/). Accédé le", Sys.Date()),
    x = "année",
    y = bquote("fraction de"~CO[2]~"dans l'air sec ("*mu*"mol/mol)"),
    x2 = "mois",
    y2 = bquote(atop("fraction de"~CO[2]~"dans l'air sec ("*mu*"mol/mol)", "en écart à la moyenne annuelle")),
    title2 = "Variation saisonnière"
  ))

# Data --------------------------------------------------------------------

# https://www.esrl.noaa.gov/gmd/ccgg/trends/
co2ml <- read_delim("ftp://aftp.cmdl.noaa.gov/products/trends/co2/co2_mm_mlo.txt",
                    delim = " ",
                    locale = locale(decimal_mark = "."),
                    na = c("-99.99", "-1"),
                    col_types = "iiddddi",
                    col_names = c("year", "month", "decimal", "co2", "co2_interpol", "co2_trend", "days"),
                    comment = "#",
                    trim_ws = TRUE) %>% 
  group_by(year) %>% 
  mutate(year_mean = mean(co2_interpol, na.rm = TRUE),
         delta = co2_interpol - year_mean,
         vdate = ymd(paste0("2015-", month, "-01"))) %>% 
  ungroup()

Create the plot for each language and save

We use a virtual date to keep the data in the same January-December interval and we add a partial dataframe to smooth the Dec./Jan. transition and build the spiral.

# Polar plot
for (l in names(language)) {
  message(l)
  current <- language[[l]]
  
  # format the date in local names
  Sys.setlocale("LC_TIME", current$locale_lc_time)
  
  p3 <- co2ml %>% 
    filter(vdate == "2015-01-01") %>% 
    mutate(vdate = ymd("2015-12-31"),
           year = year -1) %>% 
    bind_rows(co2ml) %>% 
    ggplot(aes(vdate, co2_interpol, group = year, color = year)) +
      geom_line(size = 1.2) +
      scale_x_date(breaks = pretty_breaks(12), labels = date_format("%b")) +
      scale_color_viridis_c() +
      labs(subtitle = current$title,
           x = "",
           y = current$y,
           color = current$x,
           title = paste("Mauna Loa", min(co2ml$year), "-", max(co2ml$year)),
           caption = current$caption) +
      coord_polar() +
      theme_bw() +
      theme(axis.title.y = element_text(hjust = .85),
            panel.grid.major.y  = element_blank(),
            panel.grid.minor.x = element_blank(),
            panel.border = element_blank(),
            plot.caption = element_text(size = 7))
  
  ggsave(p3, file = paste("co2_mauna_loa_polar", l, Sys.Date(), "wp.svg", sep = "_"), width = 20, height = 20, units = "cm", device = svg)
}
60 years of CO₂ increase

Does this new year look good ? Will the spiral cross its path soon ?

Catégories
Non classé

Kernel spatial smoothing : transforming points pattern to continuous coverage

Representing mass data (inhabitants, livestock,…) on a map in not always easy : choropleth maps are clearly a no-go, except if you normalize with area and then you stumble on the MAUP… A nice solution is smoothing, producing a raster. You even get freebies like (potential) statistical confidentiality, a better geographic synthesis and easy multiple layers computations.

The kernel smoothing should not be confused with interpolation or kriging : the aim here is to « spread » and sum point values, see Loonis and de Bellefon (2018) for a comprehensive explanation.

We could use the {RSAGA} package which provides a « Kernel Density Estimation » in the « grid_gridding » lib for use with rsaga.geoprocessor(). However, we’ll use the {btb} package (Santos et al. 2018) which has the great advantage of providing a way to specify a geographical study zone, avoiding our values to bleed in another country or in the sea for example.

We’ll map the french population :

  • the data is available on the IGN site
  • a 7z decompress utility must be available in your $PATH ;
  • the shapefile COMMUNE.shp has a POPULATION field ;
  • this is a polygon coverage, so we’ll take the « centroids ».
library(raster)  # load before dplyr (against select conflict)
library(tidyverse)
library(httr)
library(sf)
library(btb)

# download and extract data
zipped_file <- tempfile()
GET("ftp://Admin_Express_ext:Dahnoh0eigheeFok@ftp3.ign.fr/ADMIN-EXPRESS_2-0__SHP__FRA_2019-03-14.7z.001", 
    write_disk(zipped_file),
    progress())

system(paste("7z x -odata", zipped_file))

# create a unique polygon for France (our study zone)
fr <- read_sf("data/ADMIN-EXPRESS_2-0__SHP__FRA_2019-03-14/ADMIN-EXPRESS/1_DONNEES_LIVRAISON_2019-03-14/ADE_2-0_SHP_LAMB93_FR/REGION.shp") %>% 
  st_union() %>%
  st_sf() %>% 
  st_set_crs(2154)

# load communes ; convert to points
comm <- read_sf("data/ADMIN-EXPRESS_2-0__SHP__FRA_2019-03-14/ADMIN-EXPRESS/1_DONNEES_LIVRAISON_2019-03-14/ADE_2-0_SHP_LAMB93_FR/COMMUNE.shp")%>% 
  st_set_crs(2154) %>% 
  st_point_on_surface()
 

We create a function :

#' Kernel weighted smoothing with arbitrary bounding area
#'
#' @param df sf object (points)
#' @param field weight field in the df
#' @param bandwidth kernel bandwidth (map units)
#' @param resolution output grid resolution (map units)
#' @param zone sf study zone (polygon)
#' @param out_crs EPSG (should be an equal-area projection)
#'
#' @return a raster object
#' @import btb, raster, fasterize, dplyr, plyr, sf
lissage <- function(df, field, bandwidth, resolution, zone, out_crs = 3035) {
  
  if (st_crs(zone)$epsg != out_crs) {
    message("reprojecting data...")
    zone <- st_transform(zone, out_crs)
  }
  
  if (st_crs(df)$epsg != out_crs) {
    message("reprojecting study zone...")
    df <- st_transform(df, out_crs)
  }
  
  zone_bbox <- st_bbox(zone)
  
  # grid generation
  message("generating reference grid...")
  zone_xy <- zone %>% 
    dplyr::select(geometry) %>% 
    st_make_grid(cellsize = resolution,
                 offset = c(plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
                            plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor)),
                 what = "centers") %>%
    st_sf() %>% 
    st_join(zone, join = st_intersects, left = FALSE) %>% 
    st_coordinates() %>% 
    as_tibble() %>% 
    dplyr::select(x = X, y = Y)
  
  # kernel
  message("computing kernel...")
  kernel <- df %>% 
    cbind(., st_coordinates(.)) %>%
    st_set_geometry(NULL) %>% 
    dplyr::select(x = X, y = Y, field) %>% 
    btb::kernelSmoothing(dfObservations = .,
                         sEPSG = out_crs,
                         iCellSize = resolution,
                         iBandwidth = bandwidth,
                         vQuantiles = NULL,
                         dfCentroids = zone_xy)
  
  # rasterization
  message("\nrasterizing...")
  raster::raster(xmn = plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
                 ymn = plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor),
                 xmx = plyr::round_any(zone_bbox[3] + bandwidth, resolution, f = ceiling),
                 ymx = plyr::round_any(zone_bbox[4] + bandwidth, resolution, f = ceiling),
                 resolution = resolution) %>% 
    fasterize::fasterize(kernel, ., field = field)
}

Instead of a raw choropleth map like this (don’t do this at home) :

Inhabitants, quantile classification ; see the red arrow : a big commune with a somewhat low population (2100 inhabitants) pops out due to its big area

… we should use a proportional symbol. But it’s quite cluttered in urban areas :

You’ve got measles

We can also get the polygon density :

Density, quantile classification. Our previous commune is now more coherent, however the map is not very synthetic due to the heterogeneous size of the communes

We just have to run our function for example with a bandwidth of 20 km and a cell size of 2 km. We use an equal area projection : the LAEA Europa (EPSG:3035).

comm %>% 
  lissage("POPULATION", 20000, 2000, fr, 3035) %>%
  raster::writeRaster("pop.tif")

And lastly our smoothing :

Smoothing, discrete quantile classification

That’s a nice synthetic representation !

After that it’s easy in R to do raster algebra ; for example dividing a grid of crop yields by a grid of agricultural area, create a percent change between dates, etc.

As a conclusion, a quote :

Behind the aesthetic quality of the smoothed maps, however, lies a major trap. By construction, smoothing methods mitigate breakdowns and borders and induce continuous representation of geographical phenomena. The smoothed maps therefore show the spatial autocorrelation locally.Two points close to the smoothing radius have mechanically comparable characteristics in this type of analysis. As a result, there is little point in drawing conclusions from a smoothed map of geographical phenomena whose spatial scale is of the order of the smoothing radius.

Loonis and de Bellefon (2018)

References

Loonis, Vincent and Marie-Pierre de Bellefon, eds 2018. Handbook of Spatial Analysis. Theory and Application with R. INSEE Méthodes 131. Montrouge, France: Institut national de la statistique et des études économiques. https://www.insee.fr/en/information/3635545.

Santos, Arlindo Dos, Francois Semecurbe, Auriane Renaud, Cynthia Faivre, Thierry Cornely and Farida Marouchi. 2018. btb: Beyond the Border – Kernel Density Estimation for Urban Geography (version 0.1.30). https://CRAN.R-project.org/package=btb.

Catégories
Non classé

Metadata : from PostgreSQL comments to R labels

Metadata are an essential part of a robust data science workflow ; they record the description of the tables and the meaning of each variable : its units, quality, allowed range, how we collect it, when it’s been recorded etc. Data without metadata are practically worthless. Here we will show how to transfer the metadata from PostgreSQL to R.

In PostgreSQL metadata can be stored in comments with the statements COMMENT ON TABLE ... IS '...' or COMMENT ON COLUMN ... IS '...'. So I hope your tables and columns have these nice comments and you can see them in psql or PgAdmin for example. But what about R ?

In R, metadata can be assigned as attributes of any object and mainly as « labels » for the columns. You may have seen labels when importing labelled data from SPSS for example.

We will use the {Hmisc} package which provides functions to manage labels. Another interesting package is {sjlabelled}.

library(Hmisc)
library(tidyverse)
library(RPostgreSQL)
library(glue)
library(httr)
library(rvest)
library(janitor)

cnx <- dbConnect(dbDriver("PostgreSQL"),
                 user = "***",
                 password = "***",
                 host = "***",
                 dbname = "***",
                 port = 5432
)

We’ll start by adding a table and its metadata in PostgreSQL. I chose to use the list of all the french communes from the code officiel géographique (COG). The data is provided as a zipped CSV file ; luckily a data dictionary appears on the page so we’ll scrape it.

# data from https://www.insee.fr/fr/information/3720946

# download
dl <- tempfile()
GET("https://www.insee.fr/fr/statistiques/fichier/3720946/commune2019-csv.zip",
    write_disk(dl))
unzip(dl)

# import in PostgreSQL
read_csv("commune2019.csv",
         col_types = cols(.default = col_character())) %>% 
  clean_names() %>% 
  dbWriteTable(cnx, c("ref_cog", "commune2019"), ., row.names = FALSE)
dbSendQuery(cnx, "ALTER TABLE ref_cog.commune2019 ADD PRIMARY KEY (com, typecom);")

# get the data dictionary from INSEE
page <- GET("https://www.insee.fr/fr/information/3720946") %>% 
  content() 

table_info <- page %>% 
  html_node("#titre-bloc-3 + div > p") %>% 
  html_text() %>% 
  str_trim() %>% 
  str_replace_all("\\s+", " ")

columns_info <- page %>% 
  html_node("table") %>% 
  html_table() %>% 
  clean_names() %>% 
  mutate(designation_et_modalites_de_la_variable = str_trim(designation_et_modalites_de_la_variable),
         designation_et_modalites_de_la_variable = str_replace_all(designation_et_modalites_de_la_variable, "\\s+", " "),
         nom_de_la_variable = make_clean_names(nom_de_la_variable))

# add table metadata in PostgreSQL
dbSendQuery(cnx, glue_sql("COMMENT ON TABLE ref_cog.commune2019 IS {table_info}", .con = cnx))

# add columns metadata in PostgreSQL
walk2(columns_info$nom_de_la_variable,
      columns_info$designation_et_modalites_de_la_variable,
      ~ dbSendQuery(cnx, glue_sql("COMMENT ON COLUMN ref_cog.commune2019.{`.x`} IS {.y}", .con = cnx)))

Now we have this nice table :

=> \dt+ ref_cog.*
                                                      List of relations
 Schema  |    Name     | Type  |  Owner  |  Size   |                Description                                                                                            
---------+-------------+-------+---------+---------+-------------------------------------------------------------------
 ref_cog | commune2019 | table | xxxxxxx | 3936 kB | Liste des communes, arrondissements municipaux, communes déléguées et
                                                     communes associées au 1er janvier 2019, avec le code des niveaux
                                                     supérieurs (canton ou pseudo-canton, département, région)
(1 row)


=> \d+ ref_cog.commune2019
                                                      Table "ref_cog.commune2019"
  Column   | Type | Modifiers | Storage  | Stats target |                         Description                                                           
-----------+------+-----------+----------+--------------+----------------------------------------------------------------
 typecom   | text |           | extended |              | Type de commune
 com       | text |           | extended |              | Code commune
 reg       | text |           | extended |              | Code région
 dep       | text |           | extended |              | Code département
 arr       | text |           | extended |              | Code arrondissement
 tncc      | text |           | extended |              | Type de nom en clair
 ncc       | text |           | extended |              | Nom en clair (majuscules)
 nccenr    | text |           | extended |              | Nom en clair (typographie riche)
 libelle   | text |           | extended |              | Nom en clair (typographie riche) avec article
 can       | text |           | extended |              | Code canton. Pour les communes « multi-cantonales » code décliné 
                                                            de 99 à 90 (pseudo-canton) ou de 89 à 80 (communes nouvelles)
 comparent | text |           | extended |              | Code de la commune parente pour les arrondissements municipaux et
                                                            les communes associées ou déléguées.
Indexes:
    "commune2019_pkey" PRIMARY KEY, btree (com, typecom)

Usually we query the data to R this way :

cog <- dbGetQuery(cnx,
"SELECT
  *
FROM ref_cog.commune2019
LIMIT 10")

We can create a function that will query the metadata of the table in information_schema.columns and add it to the data frame ; the function expects a data frame, the name of the schema.table from which we get the comments and a connection handler. It will return the data frame with labels and an attribute metadata with the description of the table.

#' Add attributes to a dataframe from metadata read in the PostgreSQL database
#'
#' @param df dataframe
#' @param schema_table "schema.table" from which to read the comments
#' @param cnx a database connexion from RPostgreSQL::dbConnect()
#'
#' @return a dataframe with attributes
#'
#' @examples \dontrun{add_metadata(iris, "public.iris", cnx)}
add_metadata <- function(df, schema_table, cnx) {
  
  # get the table description and add it to a data frame attribute called "metadata"
  attr(df, "metadata") <- dbGetQuery(cnx, 
                                     glue_sql("SELECT obj_description({schema_table}::regclass) AS table_description;",
                                              .con = cnx)) %>% 
    pull(table_description)
  
  # get colmumns comments
  meta <- str_match(schema_table, "^(.*)\\.(.*)$") %>% 
    glue_sql(
      "SELECT 
        column_name,    
        pg_catalog.col_description(
          format('%s.%s', isc.table_schema, isc.table_name)::regclass::oid,
                 isc.ordinal_position) AS column_description
      FROM information_schema.columns AS isc
      WHERE isc.table_schema = {s[2]}
        AND isc.table_name = {s[3]};",
      s = .,
      .con = cnx) %>% 
    dbGetQuery(cnx, .)
  
  # match the columns comments to the variables
  label(df, self = FALSE) <- colnames(df) %>% 
    enframe() %>% 
    left_join(meta, by = c("value" = "column_name")) %>% 
    pull(column_description)
  
  df
}

Now we would do :

cog <- dbGetQuery(cnx,
  "SELECT
    *
  FROM ref_cog.commune2019
  LIMIT 10") %>% 
  add_metadata("ref_cog.commune2019", cnx)

The table description is available with :

attr(cog, "metadata")

[1] "Liste des communes, arrondissements municipaux, communes déléguées et communes associées au 1er janvier 2019, avec le code des niveaux supérieurs (canton ou pseudo-canton, département, région)"

And you can see the metadata in the column headings of the RStudio viewer with View(cog) :

… the headings now show the metadata !

We can also use contents(cog) :

Data frame:cog	10 observations and 11 variables    Maximum # NAs:10
                                                                                                                                                   -         Labels                                                              Class     Storage    NAs
typecom   Type de commune                                                     character character   0
com       Code commune                                                        character character   0
reg       Code région                                                         character character   0
dep       Code département                                                    character character   0
arr       Code arrondissement                                                 character character   0
tncc      Type de nom en clair                                                character character   0
ncc       Nom en clair (majuscules)                                           character character   0
nccenr    Nom en clair (typographie riche)                                    character character   0
libelle   Nom en clair (typographie riche) avec article                       character character   0
can       Code canton. Pour les communes « multi-cantonales » code décliné... character character   0
comparent Code de la commune parente pour les arrondissements municipaux...   character character  10

Or :

cog %>% 
  label() %>%
  enframe()

# A tibble: 11 x 2
   name      value                                                                                           
   <chr>     <chr>                                                                                           
 1 typecom   Type de commune                                                                                 
 2 com       Code commune                                                                                    
 3 reg       Code région                                                                                     
 4 dep       Code département                                                                                
 5 arr       Code arrondissement                                                                             
 6 tncc      Type de nom en clair                                                                            
 7 ncc       Nom en clair (majuscules)                                                                       
 8 nccenr    Nom en clair (typographie riche)                                                                
 9 libelle   Nom en clair (typographie riche) avec article                                                   
10 can       Code canton. Pour les communes « multi-cantonales » code décliné de 99 à 90 (pseudo-canton) ou …
11 comparent Code de la commune parente pour les arrondissements municipaux et les communes associées ou dél…

Or lastly, for one column :

label(cog$tncc)

[1] "Type de nom en clair"

We can also search for information in the variable names or in the labels with another function that can be helpful when we have a few hundred columns…

search_var <- function(df, keyword) {
  df %>% 
    label() %>%
    enframe() %>% 
    rename(variable = name,
           metadata = value) %>% 
    filter_all(any_vars(str_detect(., regex(keyword, ignore_case = TRUE))))
}

search_var(cog, "canton")

# A tibble: 1 x 2
  variable metadata                                                                                          
  <chr>    <chr>                                                                                             
1 can      Code canton. Pour les communes « multi-cantonales » code décliné de 99 à 90 (pseudo-canton) ou de…
Catégories
Non classé

Open and merge multiple shapefiles

or more precisely union many spatial tables in R in a tidy way.

  • dplyr::bind_rows doesn’t work on {sf} objects ;
  • base::rbind only works on two tables and so that’s not straightforward to use*.

So we’ll use purrr::map and tidyr::unnest.


It works again with vctrs 0.2.2 (February 2020)

As of October 2019 this method doesn’t work any longer, due to an update in the vctrs package.
https://github.com/r-spatial/sf/issues/1172.
So your best bet is to have the same structure in your shapefile and use :

dir_ls("shp", glob = "*.shp") %>%
map(read_sf) %>%
do.call(rbind, .)

First get some data, the communes of three french départements :

library(tidyverse)
library(sf)
library(fs)
library(httr)
library(leaflet)

# https://fr.actualitix.com/blog/shapefiles-des-departements-de-france.html

url <-  c("https://fr.actualitix.com/blog/actgeoshap/01-Ain.zip",
          "https://fr.actualitix.com/blog/actgeoshap/73-savoie.zip",
          "https://fr.actualitix.com/blog/actgeoshap/74-haute-savoie.zip")

dep <- str_extract(url, "\\d{2}.*$")

list(url, dep) %>% 
  pwalk(~ GET(.x, write_disk(.y)))

walk(dep, unzip, junkpaths = TRUE, exdir = "shp")

We can then create a 3 rows data frame containing a list-column in which we store the sf object. Then we just unnest it. This operation erases the sf-class, we have to add it back.

res <- dir_ls("shp", glob = "*.shp") %>% 
  tibble(fname = .) %>%
  mutate(data = map(fname, read_sf)) %>%
  unnest(data) %>%
  st_as_sf() %>%
  st_set_crs(2154)

write_sf(res, "shp/3dep.shp")

res %>% 
  st_transform(4326) %>% 
  leaflet() %>%
    addPolygons() %>% 
    addTiles()

Bonus : we have the source filename stored in the resulting shapefile.

* We could have used

dir_ls("shp", glob = "*.shp") %>% 
  map(read_sf) %>%
  do.call(rbind, .)

but the column structure doesn’t match here…