# Option 1: tidytuesdayR package
## install.packages("tidytuesdayR")
## install.packages("waldo")
## install.packages("tidytext")
## install.packages("textdata")
library(tidytuesdayR)
library(tidyverse)
library(waldo)
library(tidytext)
library(textdata)
Introduction
The latest TidyTuesday dataset was on births, deaths and other historical events that occurred in leap years, i.e. those years that include 29 February (such as 2024!). Further details are here.
Myriam led the session, and Antony provided additional code for performing text field analysis after the session.
Also, Emu the cat had the following contribution to make:
43e’/;£@@@@@@@@@@.1
The session
We started by loading some packages
We then had two ways of loading the data, in this case three datasets. As usual I’m switching to the url-based approach for the blog post
# tuesdata <- tidytuesdayR::tt_load('2024-02-27')
# ## OR
# tuesdata <- tidytuesdayR::tt_load(2024, week = 9)
# events <- tuesdata$events
# births <- tuesdata$births
# deaths <- tuesdata$deaths
# Option 2: Read directly from GitHub
<- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-27/events.csv')
events <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-27/births.csv')
births <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-27/deaths.csv') deaths
We noticed the births data include mention of at least one Pope. We wanted to explore more and less robust ways of finding popes in the births
and deaths
dataset
We could start by just looking for whether the word Pope
is in the person field of births
str_detect(births$person, "Pope")
[1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE
We then used a little expression to make the query not case sensitive:
%>% filter(str_detect(person, "(?i)Pope")) deaths
# A tibble: 1 × 4
year_death person description year_birth
<dbl> <chr> <chr> <dbl>
1 468 Pope Hilarius <NA> NA
Another approach is to use ignore_case
in the regex()
function:
%>% filter(str_detect(person, regex("pope", ignore_case = TRUE))) deaths
# A tibble: 1 × 4
year_death person description year_birth
<dbl> <chr> <chr> <dbl>
1 468 Pope Hilarius <NA> NA
The only persons with pope
in their name appear to be actual popes, not people who just happen to have the letters ‘pope’ in their surname.
Next we looked at number of events by year. We used two tidyverse approaches to producing this, one using group_by
and summarise
, the other using count
.
<- events %>%
number_events group_by(year) %>%
summarise(n= n())
number_events
# A tibble: 29 × 2
year n
<dbl> <int>
1 888 1
2 1504 1
3 1644 1
4 1704 1
5 1712 1
6 1720 1
7 1768 1
8 1796 1
9 1892 1
10 1908 1
# ℹ 19 more rows
<- events %>%
number_events_2 count(year)
number_events_2
# A tibble: 29 × 2
year n
<dbl> <int>
1 888 1
2 1504 1
3 1644 1
4 1704 1
5 1712 1
6 1720 1
7 1768 1
8 1796 1
9 1892 1
10 1908 1
# ℹ 19 more rows
We then tried different comparator functions to see if they all agreed the contents were identical, with some mixed and confusing results:
::compare(number_events, number_events_2) waldo
✔ No differences
waldo
says they are the same.
identical(number_events, number_events_2)
[1] FALSE
identical
says they are not identical
setequal(number_events, number_events_2)
[1] TRUE
But setequal
doesn’t find differences
all.equal(number_events, number_events_2)
[1] "Attributes: < Names: 1 string mismatch >"
[2] "Attributes: < Length mismatch: comparison on first 2 components >"
[3] "Attributes: < Component \"class\": Lengths (3, 4) differ (string compare on first 3) >"
[4] "Attributes: < Component \"class\": 3 string mismatches >"
[5] "Attributes: < Component 2: Modes: numeric, externalptr >"
[6] "Attributes: < Component 2: Lengths: 29, 1 >"
[7] "Attributes: < Component 2: target is numeric, current is externalptr >"
All equal reports a number of differences, related to the attributes (metadata) between the two objects being compared.
Curiouser and Curiouser…
Now let’s plot the number of events over time
%>%
number_events ggplot(aes(x = year, y = n))+
geom_col()
We wanted to know if there was anyone who was both recorded as being born and dying in a leap year:
<- births %>%
person_bd inner_join(deaths, by = "person")
person_bd
# A tibble: 1 × 7
year_birth.x person description.x year_death.x year_death.y description.y
<dbl> <chr> <chr> <dbl> <dbl> <chr>
1 1812 James Miln… Scottish-Aus… 1880 1880 Scottish-Aus…
# ℹ 1 more variable: year_birth.y <dbl>
One person (born in Scotland!)
We then looked text analysis, and in particular sentiment analysis of the content of the descriptio field:
%>%
births unnest_tokens(word, description) %>%
anti_join(get_stopwords()) %>%
left_join(get_sentiments("afinn"))
# A tibble: 432 × 5
year_birth person year_death word value
<dbl> <chr> <dbl> <chr> <dbl>
1 1468 Pope Paul III 1549 <NA> NA
2 1528 Albert V 1579 duke NA
3 1528 Albert V 1579 bavaria NA
4 1528 Domingo Báñez 1604 spanish NA
5 1528 Domingo Báñez 1604 theologian NA
6 1572 Edward Cecil 1638 1st NA
7 1572 Edward Cecil 1638 viscount NA
8 1572 Edward Cecil 1638 wimbledon NA
9 1576 Antonio Neri 1614 florentine NA
10 1576 Antonio Neri 1614 priest NA
# ℹ 422 more rows
%>%
events unnest_tokens(word, event) %>%
anti_join(get_stopwords()) %>%
left_join(get_sentiments("afinn"))
# A tibble: 418 × 3
year word value
<dbl> <chr> <dbl>
1 888 odo NA
2 888 count NA
3 888 paris NA
4 888 crowned NA
5 888 king NA
6 888 west NA
7 888 francia NA
8 888 france NA
9 888 archbishop NA
10 888 walter NA
# ℹ 408 more rows
Here’s the words in the afinn
object with the highest (most positive) sentiment
get_sentiments("afinn") %>%
arrange(desc(value))
# A tibble: 2,477 × 2
word value
<chr> <dbl>
1 breathtaking 5
2 hurrah 5
3 outstanding 5
4 superb 5
5 thrilled 5
6 amazing 4
7 awesome 4
8 brilliant 4
9 ecstatic 4
10 euphoric 4
# ℹ 2,467 more rows
And here’s an exploration of average sentiment by (leap)year based on the events description field:
|>
events unnest_tokens(word, event) |>
anti_join(get_stopwords()) |>
right_join(get_sentiments("afinn")) |>
group_by(year) |>
summarise(mean_sentiment = mean(value)) |>
ggplot(aes(x = year, y = mean_sentiment)) +
geom_point() +
geom_smooth()
Antony’s script
Load libraries
library(tidyverse)
library(lubridate)
library(countrycode)
some extra data sets re nationalities
<- readr::read_csv("https://raw.githubusercontent.com/knowitall/chunkedextractor/master/src/main/resources/edu/knowitall/chunkedextractor/demonyms.csv",
demonym col_names = c("demonym","geography"))
$demonym <- tolower(demonym$demonym)
demonym$geography <- tolower(demonym$geography)
demonym<- tibble(country=countrycode::codelist$country.name.en) country
Load data
# tuesdata <- tidytuesdayR::tt_load('2024-02-27')
# list2env(tuesdata,.GlobalEnv)
glimpse(events)
Rows: 37
Columns: 2
$ year <dbl> 888, 1504, 1644, 1704, 1712, 1720, 1768, 1796, 1892, 1908, 1912,…
$ event <chr> "Odo, count of Paris, is crowned king of West Francia (France) b…
glimpse(births)
Rows: 121
Columns: 4
$ year_birth <dbl> 1468, 1528, 1528, 1572, 1576, 1640, 1692, 1724, 1736, 1792…
$ person <chr> "Pope Paul III", "Albert V", "Domingo Báñez", "Edward Ceci…
$ description <chr> NA, "Duke of Bavaria", "Spanish theologian", "1st Viscount…
$ year_death <dbl> 1549, 1579, 1604, 1638, 1614, 1704, 1763, 1822, 1784, 1868…
glimpse(deaths)
Rows: 62
Columns: 4
$ year_death <dbl> 468, 992, 1460, 1528, 1592, 1600, 1604, 1712, 1744, 1792, …
$ person <chr> "Pope Hilarius", "Oswald of Worcester", "Albert III", "Pat…
$ description <chr> NA, "Anglo-Saxon archbishop and saint", "Duke of Bavaria-M…
$ year_birth <dbl> NA, 925, 1401, 1504, 1536, 1529, 1530, 1653, 1683, 1728, 1…
Which cohort of leap day births is most represented in Wikipedia’s data?
Are any years surprisingly underrepresented compared to nearby years?
What other patterns can you find in the data?
how many popes?
%>%
births mutate(is_pope = grepl("pope",tolower(paste(person,description)))) %>%
count(is_pope)
# A tibble: 2 × 2
is_pope n
<lgl> <int>
1 FALSE 120
2 TRUE 1
count births by century —-
<- function(year) {
getCenturyCorrected if (year %% 100 == 0) {
<- year / 100
century else {
} <- ceiling(year / 100)
century
}return(century)
}
getCenturyCorrected(1900)
[1] 19
getCenturyCorrected(1901)
[1] 20
%>%
births mutate(century=sapply(year_birth,getCenturyCorrected)) %>%
count(century)
# A tibble: 7 × 2
century n
<dbl> <int>
1 15 1
2 16 4
3 17 2
4 18 3
5 19 11
6 20 99
7 21 1
do count() and summarise(n=n()) give identical dataframes? Not always —-
<- births %>% count(year_birth)
x <- births %>% group_by(year_birth) %>% summarise(n=n())
y
identical(attributes(x), attributes(y))
[1] FALSE
names(x)==names(y)
[1] TRUE TRUE
identical(
x,
y )
[1] FALSE
a rough stab (clearly flawed) at parsing nationality —-
<-
births_nationality bind_rows(
%>%
births ::unnest_tokens(word, description) %>%
tidytextanti_join(tidytext::get_stopwords(), "word") %>%
left_join(
demonym,by = c(word = "geography"),
relationship = "many-to-many"
%>%
) left_join(demonym, by = "demonym"),
%>%
births ::unnest_tokens(word, description) %>%
tidytextanti_join(tidytext::get_stopwords(), "word") %>%
left_join(demonym, c(word = "demonym")) %>%
left_join(demonym, "geography", relationship = "many-to-many")
)
%>% count(geography) %>% arrange(-n) births_nationality
# A tibble: 35 × 2
geography n
<chr> <int>
1 <NA> 732
2 united states 216
3 australia 40
4 england 39
5 canada 24
6 zealand 12
7 spain 10
8 wales 8
9 france 6
10 turkey 6
# ℹ 25 more rows
Now a pretty wordcloud
%>%
events unnest_tokens(word, event) %>%
anti_join(get_stopwords(),"word") %>%
count(word) %>%
::wordcloud(words = .$word,
{wordcloudfreq = .$n, min.freq = 1,
max.words = 20, random.order = FALSE, rot.per = 0.35,
colors = RColorBrewer::brewer.pal(8, "Dark2"))}
A neutral word should have a sentiment score of 0, not NA. Let’s make that change…
<- get_sentiments('afinn')
afinn_sentiments
%>%
events unnest_tokens(word, event) %>%
anti_join(get_stopwords(),"word") %>%
left_join(afinn_sentiments,"word") %>%
# filter(!is.na(value)) %>%
replace_na(list(value=0)) %>%
mutate(century = sapply(year,getCenturyCorrected)) %>%
group_by(century) %>%
summarise(mean_sentiment = mean(value)) %>%
ggplot(aes(x=century,y=mean_sentiment))+geom_line()
Footnotes
I don’t think even regex can help us with this one.↩︎