library(tidyverse)
library(lubridate)
library(ggplot2)Lab 7: Data Tidying, Transformation and Visualization with COVID-19 reporting Data
download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
destfile = "data/time_series_covid19_confirmed_global.csv")time_series_confirmed <- read_csv("data/time_series_covid19_confirmed_global.csv")|>
rename(Province_State = "Province/State", Country_Region = "Country/Region")Exercise Examples
time_series_confirmed_long <- time_series_confirmed |>
pivot_longer(-c(Province_State, Country_Region, Lat, Long),
names_to = "Date", values_to = "Confirmed") time_series_confirmed_long$Date <- mdy(time_series_confirmed_long$Date)Making Graphs from the time series Data
time_series_confirmed_long|>
group_by(Country_Region, Date) |>
summarise(Confirmed = sum(Confirmed)) |>
filter (Country_Region == "US") |>
ggplot(aes(x = Date, y = Confirmed)) +
geom_point() +
geom_line() +
ggtitle("US COVID-19 Confirmed Cases")time_series_confirmed_long |>
group_by(Country_Region, Date) |>
summarise(Confirmed = sum(Confirmed)) |>
filter (Country_Region %in% c("China","France","Italy",
"Korea, South", "US")) |>
ggplot(aes(x = Date, y = Confirmed, color = Country_Region)) +
geom_point() +
geom_line() +
ggtitle("COVID-19 Confirmed Cases")time_series_confirmed_long_daily <-time_series_confirmed_long |>
group_by(Country_Region, Date) |>
summarise(Confirmed = sum(Confirmed)) |>
mutate(Daily = Confirmed - lag(Confirmed, default = first(Confirmed )))time_series_confirmed_long_daily |>
filter (Country_Region == "US") |>
ggplot(aes(x = Date, y = Daily, color = Country_Region)) +
geom_point() +
ggtitle("COVID-19 Confirmed Cases")time_series_confirmed_long_daily |>
filter (Country_Region == "US") |>
ggplot(aes(x = Date, y = Daily, color = Country_Region)) +
geom_line() +
ggtitle("COVID-19 Confirmed Cases")time_series_confirmed_long_daily |>
filter (Country_Region == "US") |>
ggplot(aes(x = Date, y = Daily, color = Country_Region)) +
geom_smooth() +
ggtitle("COVID-19 Confirmed Cases")time_series_confirmed_long_daily |>
filter (Country_Region == "US") |>
ggplot(aes(x = Date, y = Daily, color = Country_Region)) +
geom_smooth(method = "gam", se = FALSE) +
ggtitle("COVID-19 Confirmed Cases")Animated Graphs with gganimate
library(gganimate)
library(gifski)
theme_set(theme_bw())daily_counts <- time_series_confirmed_long_daily |>
filter (Country_Region == "US")
p <- ggplot(daily_counts, aes(x = Date, y = Daily, color = Country_Region)) +
geom_point() +
ggtitle("Confirmed COVID-19 Cases") +
# gganimate lines
geom_point(aes(group = seq_along(Date))) +
transition_reveal(Date)
# make the animation
animate(p, renderer = gifski_renderer(), end_pause = 15)Animation of confirmed deaths
download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
destfile = "data/time_series_covid19_deaths_global.csv")time_series_deaths_confirmed <- read_csv("data/time_series_covid19_deaths_global.csv")|>
rename(Province_State = "Province/State", Country_Region = "Country/Region")
time_series_deaths_long <- time_series_deaths_confirmed |>
pivot_longer(-c(Province_State, Country_Region, Lat, Long),
names_to = "Date", values_to = "Confirmed")
time_series_deaths_long$Date <- mdy(time_series_deaths_long$Date)p <- time_series_deaths_long |>
filter (Country_Region %in% c("US","Canada", "Mexico","Brazil","Egypt","Ecuador","India", "Netherlands", "Germany", "China" )) |>
ggplot(aes(x=Country_Region, y=Confirmed, color= Country_Region)) +
geom_point(aes(size=Confirmed)) +
transition_time(Date) +
labs(title = "Cumulative Deaths: {frame_time}") +
ylab("Deaths") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
animate(p, renderer = gifski_renderer(), end_pause = 15)Exercises
Exercise 1: Go through Chapter 5 in R for Data Sciences - Data Tiyding and Pivot
5.2 Tidy Data
table1 |>
mutate(rate = cases / population * 10000)# A tibble: 6 × 5
country year cases population rate
<chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan 1999 745 19987071 0.373
2 Afghanistan 2000 2666 20595360 1.29
3 Brazil 1999 37737 172006362 2.19
4 Brazil 2000 80488 174504898 4.61
5 China 1999 212258 1272915272 1.67
6 China 2000 213766 1280428583 1.67
table1 |>
group_by(year) |>
summarize(total_cases = sum(cases))# A tibble: 2 × 2
year total_cases
<dbl> <dbl>
1 1999 250740
2 2000 296920
ggplot(table1, aes(x = year, y = cases)) +
geom_line(aes(group = country), color = "grey") +
geom_point(aes(color = country, shape = country)) +
scale_x_continuous(breaks = c(1999, 2000)) # x-axis breaks at 1999 and 20005.2.1 Exercise
1. For each of the sample tables, describe what each observation and each column represents.
table1# A tibble: 6 × 4
country year cases population
<chr> <dbl> <dbl> <dbl>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
table2# A tibble: 12 × 4
country year type count
<chr> <dbl> <chr> <dbl>
1 Afghanistan 1999 cases 745
2 Afghanistan 1999 population 19987071
3 Afghanistan 2000 cases 2666
4 Afghanistan 2000 population 20595360
5 Brazil 1999 cases 37737
6 Brazil 1999 population 172006362
7 Brazil 2000 cases 80488
8 Brazil 2000 population 174504898
9 China 1999 cases 212258
10 China 1999 population 1272915272
11 China 2000 cases 213766
12 China 2000 population 1280428583
table3# A tibble: 6 × 3
country year rate
<chr> <dbl> <chr>
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
Table 1: The observations are representing the country and what year it is with number of TB cases and the total population in the country at the time. Each column represents country, year, cases, and population.
Table 2: The observations are the countries are oganized by cases and population with the year. It also gives the numerical value corresponding to cases or population. Each column represents country, year, type, count.
Table 3: The observations are the year for the country and the rate of TB by showing case/population. The columns represent country, year, and rate.
2. Sketch out the process you’d use to calculate the rate for table2 and table3. You will need to perform four operations:
Extract the number of TB cases per country per year.
Extract the matching population per country per year.
Divide cases by population, and multiply by 10000.
Store back in the appropriate place.
table2_cases <- table2 |>
filter (type == “cases”) |>
select (country, year, count)
table2_population <-table2 |>
filter (type == “cases”) |>
select (country, year, count)
All going into variable table2_cases_population
- Combine the two of these results and mutate it so that we can have cases / population and multiply by 10000
table3 can be sorted into two columns of just cases and population. Since it’s set up as a fraction you would filter the number at the beginning as one column and the number after the slash as the second column. After this is done then you can calculate cases/population for all the columns at once.
5.3 Lengthening Data
billboard |>
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank"
)# A tibble: 24,092 × 5
artist track date.entered week rank
<chr> <chr> <date> <chr> <dbl>
1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
7 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk7 99
8 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk8 NA
9 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk9 NA
10 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk10 NA
# ℹ 24,082 more rows
billboard |>
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank",
values_drop_na = TRUE
)# A tibble: 5,307 × 5
artist track date.entered week rank
<chr> <chr> <date> <chr> <dbl>
1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
7 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk7 99
8 2Ge+her The Hardest Part Of ... 2000-09-02 wk1 91
9 2Ge+her The Hardest Part Of ... 2000-09-02 wk2 87
10 2Ge+her The Hardest Part Of ... 2000-09-02 wk3 92
# ℹ 5,297 more rows
billboard_longer <- billboard |>
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank",
values_drop_na = TRUE
) |>
mutate(
week = parse_number(week)
)billboard_longer |>
ggplot(aes(x = week, y = rank, group = track)) +
geom_line(alpha = 0.25) +
scale_y_reverse()who2 |>
pivot_longer(
cols = !(country:year),
names_to = c("diagnosis", "gender", "age"),
names_sep = "_",
values_to = "count"
)# A tibble: 405,440 × 6
country year diagnosis gender age count
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 Afghanistan 1980 sp m 014 NA
2 Afghanistan 1980 sp m 1524 NA
3 Afghanistan 1980 sp m 2534 NA
4 Afghanistan 1980 sp m 3544 NA
5 Afghanistan 1980 sp m 4554 NA
6 Afghanistan 1980 sp m 5564 NA
7 Afghanistan 1980 sp m 65 NA
8 Afghanistan 1980 sp f 014 NA
9 Afghanistan 1980 sp f 1524 NA
10 Afghanistan 1980 sp f 2534 NA
# ℹ 405,430 more rows
household |>
pivot_longer(
cols = !family,
names_to = c(".value", "child"),
names_sep = "_",
values_drop_na = TRUE
)# A tibble: 9 × 4
family child dob name
<int> <chr> <date> <chr>
1 1 child1 1998-11-26 Susan
2 1 child2 2000-01-29 Jose
3 2 child1 1996-06-22 Mark
4 3 child1 2002-07-11 Sam
5 3 child2 2004-04-05 Seth
6 4 child1 2004-10-10 Craig
7 4 child2 2009-08-27 Khai
8 5 child1 2000-12-05 Parker
9 5 child2 2005-02-28 Gracie
5.4 Widening Data
cms_patient_experience |>
distinct(measure_cd, measure_title)# A tibble: 6 × 2
measure_cd measure_title
<chr> <chr>
1 CAHPS_GRP_1 CAHPS for MIPS SSM: Getting Timely Care, Appointments, and Infor…
2 CAHPS_GRP_2 CAHPS for MIPS SSM: How Well Providers Communicate
3 CAHPS_GRP_3 CAHPS for MIPS SSM: Patient's Rating of Provider
4 CAHPS_GRP_5 CAHPS for MIPS SSM: Health Promotion and Education
5 CAHPS_GRP_8 CAHPS for MIPS SSM: Courteous and Helpful Office Staff
6 CAHPS_GRP_12 CAHPS for MIPS SSM: Stewardship of Patient Resources
cms_patient_experience |>
pivot_wider(
names_from = measure_cd,
values_from = prf_rate
)# A tibble: 500 × 9
org_pac_id org_nm measure_title CAHPS_GRP_1 CAHPS_GRP_2 CAHPS_GRP_3
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 0446157747 USC CARE MEDICA… CAHPS for MI… 63 NA NA
2 0446157747 USC CARE MEDICA… CAHPS for MI… NA 87 NA
3 0446157747 USC CARE MEDICA… CAHPS for MI… NA NA 86
4 0446157747 USC CARE MEDICA… CAHPS for MI… NA NA NA
5 0446157747 USC CARE MEDICA… CAHPS for MI… NA NA NA
6 0446157747 USC CARE MEDICA… CAHPS for MI… NA NA NA
7 0446162697 ASSOCIATION OF … CAHPS for MI… 59 NA NA
8 0446162697 ASSOCIATION OF … CAHPS for MI… NA 85 NA
9 0446162697 ASSOCIATION OF … CAHPS for MI… NA NA 83
10 0446162697 ASSOCIATION OF … CAHPS for MI… NA NA NA
# ℹ 490 more rows
# ℹ 3 more variables: CAHPS_GRP_5 <dbl>, CAHPS_GRP_8 <dbl>, CAHPS_GRP_12 <dbl>
cms_patient_experience |>
pivot_wider(
id_cols = starts_with("org"),
names_from = measure_cd,
values_from = prf_rate
)# A tibble: 95 × 8
org_pac_id org_nm CAHPS_GRP_1 CAHPS_GRP_2 CAHPS_GRP_3 CAHPS_GRP_5 CAHPS_GRP_8
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0446157747 USC C… 63 87 86 57 85
2 0446162697 ASSOC… 59 85 83 63 88
3 0547164295 BEAVE… 49 NA 75 44 73
4 0749333730 CAPE … 67 84 85 65 82
5 0840104360 ALLIA… 66 87 87 64 87
6 0840109864 REX H… 73 87 84 67 91
7 0840513552 SCL H… 58 83 76 58 78
8 0941545784 GRITM… 46 86 81 54 NA
9 1052612785 COMMU… 65 84 80 58 87
10 1254237779 OUR L… 61 NA NA 65 NA
# ℹ 85 more rows
# ℹ 1 more variable: CAHPS_GRP_12 <dbl>
Exercise 2: Instead of making a graph of 5 countries on the same graph as in the above example, use facet_wrap with scales=“free_y”.
five_countries <- c("US", "Afghanistan", "Brazil", "Cuba", "Egypt")
filtered_data <- time_series_confirmed_long |>
filter(Country_Region %in% five_countries)
ggplot(filtered_data, aes(x = Date, y = Confirmed)) +
geom_line(color = "blue") +
facet_wrap(vars(Country_Region), scales = "free_y", ncol = 3) +
ggtitle("COVID-19 Confirmed Cases")Exercise 3: Using the daily count of confirmed cases, make a single graph with 5 countries of your choosing.
time_series_confirmed_long_daily |>
group_by(Country_Region) |>
filter (Country_Region %in% c("Brazil","Denmark","US","Albania", "Cuba")) |>
ggplot(aes(x = Date, y = Daily, color = Country_Region)) +
geom_point() +
geom_line() ggtitle("Daily Count of COVID-19 Confirmed Cases")$title
[1] "Daily Count of COVID-19 Confirmed Cases"
attr(,"class")
[1] "labels"
Exercise 4: Plot the cumulative deaths in the US, Canada and Mexico (you will need to download time_series_covid19_deaths_global.csv)
time_series_deaths_long |>
filter(Country_Region %in% c("US", "Canada", "Mexico")) |>
ggplot(aes(x= Date, y= Confirmed, color= Country_Region)) +
geom_line() +
labs(title = "Cumulative COVID-19 Deaths",
x = "Date", y = "Deaths", color = "Country")Exercise 5: Make a graph with the countries of your choice using the daily deaths data
time_series_deaths_long |>
group_by(Country_Region) |>
mutate(Daily_Deaths = Confirmed - lag(Confirmed, default = 0)) |>
ungroup() |>
filter(Country_Region %in% c("Brazil", "Denmark", "US"))|>
ggplot(aes(x = Date, y = Daily_Deaths, color = Country_Region)) +
geom_line() +
ggtitle("Daily Deaths of COVID-19 Confirmed Cases")Exercise 6: Make an animation of your choosing (do not use a graph with geom_smooth)
p <- time_series_confirmed_long |>
filter(Country_Region %in% c("US", "Germany", "Egypt", "Netherlands")) |>
ggplot(aes(x=Country_Region, y=Confirmed, color = Country_Region)) +
geom_point(aes(size = Confirmed)) +
transition_time(Date) +
labs(title = "Total Cases Confirmed: {frame_time}") +
ylab("Confirmed") +
theme(axis.text.x = element_text(angle=90, vjust = 1, hjust = 1))
animate(p, renderer = gifski_renderer(), end_pause = 13)