Day 2 of 30DayMapChallenge: « Lines » (previously).
We’ll map the gender of street names in Lyon. For that we need a database of French first names from which to infer the gender, and we will extract the streets of Lyon from OpenStreetMap.
library(arrow)
library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(ggplot2)
library(stringr)
library(sf)
library(osmdata)
library(ggspatial)
library(glue)
library(knitr)
# Fix the random-number seed.
# NOTE(review): no sampling call is visible in this file — presumably this keeps
# the randomly sampled preview tables reproducible; confirm.
set.seed(42)
First names
# Build — or reload from the local cache "freq_prenoms.rds" — the table of
# first-name gender frequencies from the INSEE national first-names file.
# Result: one row per transliterated first name (`preusuel`) with
#   total : sum of `nombre` over both sexes,
#   M, F  : share of `total` recorded for each sex (0 when a sex is absent).
if (!file.exists("freq_prenoms.rds")) {
  freq_prenoms <- read_parquet(
    "https://www.insee.fr/fr/statistiques/fichier/8205621/prenoms-2023-nat.parquet"
  ) |>
    filter(preusuel != "_PRENOMS_RARES") |>
    # Transliterate to plain ASCII; spellings that collapse to the same
    # string are aggregated together by the group_by() below.
    mutate(preusuel = iconv(preusuel, to = "ASCII//TRANSLIT")) |>
    group_by(preusuel, sexe) |>
    summarise(n = sum(nombre, na.rm = TRUE),
              .groups = "drop_last") |>
    # Still grouped by `preusuel` here, so `total` adds up both sexes of a name.
    mutate(total = sum(n)) |>
    ungroup() |>
    mutate(sexe = case_when(sexe == 1 ~ "M",
                            sexe == 2 ~ "F",
                            .default = NA_character_)) |>
    pivot_wider(names_from = sexe,
                values_from = n,
                values_fill = 0) |>
    mutate(across(c(M, F), \(x) x / total)) |>
    # write_rds() returns its input invisibly, so the assignment above still
    # receives the table while the cache file is written.
    write_rds("freq_prenoms.rds")
} else {
  freq_prenoms <- read_rds("freq_prenoms.rds")
}
We have 34234 first names and their gender frequencies since 1900.
preusuel | total | M | F |
---|---|---|---|
ZENABOU | 48 | 0 | 1 |
EMILIENE | 25 | 0 | 1 |
KINGSLEY | 878 | 1 | 0 |
DOLOVAN | 73 | 1 | 0 |
ERCOLE | 67 | 1 | 0 |
YVA | 178 | 0 | 1 |
ISSEY | 79 | 1 | 0 |
SAWSSEN | 121 | 0 | 1 |
MISBAH | 24 | 0 | 1 |
GOHANN | 20 | 1 | 0 |
Map data
# Bounding box of the city of Lyon as returned by osmdata::getbb();
# it is used below both to query OpenStreetMap and to frame the map.
lyon_bbox <- getbb("Lyon, France", featuretype = "city")
# Download — or reload from the local cache "osm.rds" — the named streets
# inside `lyon_bbox`, keeping one feature per distinct street name.
if (!file.exists("osm.rds")) {
  lyon <- opq(lyon_bbox) |>
    add_osm_features(features = c(
      '"highway"="motorway"',
      '"highway"="trunk"',
      '"highway"="primary"',
      '"highway"="secondary"',
      '"highway"="tertiary"',
      '"highway"="motorway_link"',
      '"highway"="trunk_link"',
      '"highway"="primary_link"',
      '"highway"="secondary_link"',
      '"highway"="tertiary_link"',
      '"highway"="motorway_junction"',
      '"highway"="unclassified"',
      '"highway"="service"',
      '"highway"="pedestrian"',
      '"highway"="living_street"',
      '"highway"="residential"')) |>
    osmdata_sf() |>
    # Only the line geometries are kept.
    pluck("osm_lines") |>
    select(osm_id, name) |>
    drop_na(name) |>
    # An empty summarise() on a grouped sf object merges the geometries of
    # each group: all segments sharing a name become a single feature.
    group_by(name) |>
    summarise() |>
    # write_rds() returns its input invisibly, so `lyon` still gets the data.
    write_rds("osm.rds")
} else {
  lyon <- read_rds("osm.rds")
}
That’s 3270 street names.
Finding first names in street names
We use a brute-force method: for each street, we check whether any word of its name appears in our lists of female or male first names. We keep only first names that are strongly associated with one gender (more than 80 % of the people bearing them).
# First names recorded as female in more than 80 % of cases, at least two
# characters long.
# NOTE(review): "LA" is excluded — presumably because it is also the French
# article found in many street names; confirm.
female <- freq_prenoms |>
  filter(F > .8,
         str_length(preusuel) > 1,
         preusuel != "LA") |>
  pull(preusuel)
# First names recorded as male in more than 80 % of cases, at least two
# characters long.
male <- freq_prenoms |>
  filter(M > .8,
         str_length(preusuel) > 1) |>
  pull(preusuel)
# Build a single regular expression matching any of `first_names` as a whole
# word: \b(?:NAME1|NAME2|...)\b
# The previous glue_collapse(sep = "\\b|\\b", last = "\\b") form was wrong:
# `last` is the separator placed between the final two items, not a suffix.
# The first name therefore had no leading word boundary, and the last two
# names were fused into "Y\bZ", which can never match.
whole_word_pattern <- function(first_names) {
  str_c("\\b(?:", str_c(first_names, collapse = "|"), ")\\b")
}

# Classify every street by the first names found in its normalised label:
#   f, m   : list-columns of the female / male first names matched in `name`
#   gender : "female" / "male" when one list is longer than the other,
#            "not concerned" when nothing matched,
#            "undecidable" when both lists have the same non-zero length.
street_gender <- lyon |>
  mutate(name = str_to_upper(iconv(name, to = "ASCII//TRANSLIT")),
         f = str_extract_all(name, whole_word_pattern(female)),
         m = str_extract_all(name, whole_word_pattern(male)),
         # Once the two strict comparisons have failed, both lists have the
         # same length, so testing one of them for emptiness is enough.
         gender = case_when(lengths(f) > lengths(m) ~ "female",
                            lengths(m) > lengths(f) ~ "male",
                            lengths(f) == 0L ~ "not concerned",
                            .default = "undecidable"))
name | geometry | f | m | gender |
---|---|---|---|---|
COURS DE VERDUN RECAMIER | LINESTRING (4.830426 45.748… | | | not concerned |
IMPASSE DES ANGLAIS | LINESTRING (4.795807 45.753… | | | not concerned |
RUE DES PROVENCES | LINESTRING (4.79335 45.7369… | | | not concerned |
CHEMIN DES PEUPLIERS | LINESTRING (4.866587 45.801… | | | not concerned |
ALLEE DU LEVANT | LINESTRING (4.878859 45.759… | | | not concerned |
RUE ROPOSTE | LINESTRING (4.866353 45.760… | | | not concerned |
ALLEE NELLIE BLY | LINESTRING (4.84882 45.7429… | NELLIE | | female |
QUAI JEAN MOULIN | MULTILINESTRING ((4.837853 … | | JEAN | male |
LA VIEILLE ROUTE | LINESTRING (4.769782 45.720… | | | not concerned |
AVENUE DE CHAMPAGNE | MULTILINESTRING ((4.796801 … | | | not concerned |
Map
# Colour assigned to each gender class; the order of the names also defines
# the factor levels (and hence the legend order) used when plotting.
pal_mf <- c("female" = "lightpink1",
            "male" = "lightskyblue",
            "undecidable" = "lightyellow4",
            "not concerned" = "seashell2")
# Draw the streets coloured by gender class, framed on the Lyon bounding box.
street_gender |>
  # Order the classes like the palette.
  mutate(gender = factor(gender, levels = names(pal_mf))) |>
  st_set_crs("EPSG:4326") |>
  ggplot() +
  geom_sf(aes(color = gender),
          linewidth = .5,
          key_glyph = "timeseries") +
  scale_color_manual(values = pal_mf) +
  annotation_scale(bar_cols = c("darkgrey", "white"),
                   line_col = "darkgrey",
                   text_col = "darkgrey",
                   height = unit(0.1, "cm")) +
  # Elements 1 and 3 of the bbox matrix are used as the x limits,
  # elements 2 and 4 as the y limits.
  coord_sf(xlim = lyon_bbox[c(1, 3)],
           ylim = lyon_bbox[c(2, 4)]) +
  labs(title = "Gender in Lyon street names",
       color = "",
       caption = glue("Map data © OpenStreetMap contributors
using INSEE Fichier des prénoms 2023
r.iresmi.net - {Sys.Date()}")) +
  theme_void() +
  theme(plot.background = element_rect(color = NA,
                                       fill = "white"),
        plot.caption = element_text(size = 5,
                                    color = "darkgrey"))
Possible misclassifications
Many biases make this map unreliable; it would need manual editing…
epicene first names
- some first names can be male or female (GWEN, CAMILLE, DOMINIQUE)
not concerned
- street names of people but without the first name (RUE VILLON),
- title instead of first name (RUE DE L’AMIRAL COURBET),
has a gender but shouldn’t
- common nouns used as first names (CHEMIN DE LA POMME), mainly for girls…
- strange first names (AUTOROUTE DU SOLEIL, Soleil seems to be a girl name…)
accidentally well classified
- the last name is also a first name (COURS BAYARD)