ISCII Covid Official Data Analysis (Last update 07 mayo, 2023)

Download data

This document downloads and analyses data providad by “Instituto de la Salud Carlos III” associated with the following data file https://cnecovid.isciii.es/covid19/resources/casos_hosp_uci_def_sexo_edad_provres.csv.

rm(list=ls())

setwd("C:/Users/rominsol/Mi unidad/uc3m/IntroductiontoDataScience/MaterialCurso2022_23/Class14")

print('Download data file https://cnecovid.isciii.es/covid19/resources/casos_hosp_uci_def_sexo_edad_provres.csv')
## [1] "Download data file https://cnecovid.isciii.es/covid19/resources/casos_hosp_uci_def_sexo_edad_provres.csv"
print(paste(c('and storage it in the folder ', getwd())))
## [1] "and storage it in the folder "                                                          
## [2] "C:/Users/rominsol/Mi unidad/uc3m/IntroductiontoDataScience/MaterialCurso2022_23/Class14"
#input('con el nombre dataMOMO.csv');
url ='https://cnecovid.isciii.es/covid19/resources/casos_hosp_uci_def_sexo_edad_provres.csv';
filename = 'dataISCIII.csv';
if(!file.exists(filename)){
  download.file(url, filename)
}
aux=installed.packages()[,"Package"]

#
if(!("lubridate"%in%installed.packages()[,"Package"])){
  install.packages("lubridate", repos="http://cran.rstudio.com/", dependencies=TRUE)  
}
library("lubridate")
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Load current ISCIII data set 
data = read.csv(filename,
                header=TRUE,
                sep=",",
                dec=".",
                stringsAsFactors=TRUE,
                encoding="UTF-8")

Once the data is available in the environment, we take a look to the structure of the information content. Note that some of the variables are storaged as factor, including the date. The rest of variables correspond to integers:

head(data,15)
##    provincia_iso sexo grupo_edad      fecha num_casos num_hosp num_uci num_def
## 1              A    H        0-9 2020-01-01         0        0       0       0
## 2              A    H      10-19 2020-01-01         0        0       0       0
## 3              A    H      20-29 2020-01-01         0        0       0       0
## 4              A    H      30-39 2020-01-01         0        0       0       0
## 5              A    H      40-49 2020-01-01         0        0       0       0
## 6              A    H      50-59 2020-01-01         0        0       0       0
## 7              A    H      60-69 2020-01-01         0        0       0       0
## 8              A    H      70-79 2020-01-01         0        0       0       0
## 9              A    H        80+ 2020-01-01         0        0       0       0
## 10             A    H         NC 2020-01-01         0        0       0       0
## 11             A    M        0-9 2020-01-01         0        0       0       0
## 12             A    M      10-19 2020-01-01         0        0       0       0
## 13             A    M      20-29 2020-01-01         0        0       0       0
## 14             A    M      30-39 2020-01-01         0        0       0       0
## 15             A    M      40-49 2020-01-01         0        0       0       0

Before starting the exploratory data analysis, we proceed to generate three additional auxiliary variables associated with date, the day of the year and the year information for each record, respectively:

# Create dates
data$date=as.Date(data$fecha)
# Create the days within() the year as a factor and include it in the data frame
data$yearday=as.factor(yday(data$date))
# Create the years within the year as a factor and include it in the data frame
data$year=as.factor(year(data$date))
str(data)
## 'data.frame':    1299030 obs. of  11 variables:
##  $ provincia_iso: Factor w/ 52 levels "A","AB","AL",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sexo         : Factor w/ 3 levels "H","M","NC": 1 1 1 1 1 1 1 1 1 1 ...
##  $ grupo_edad   : Factor w/ 10 levels "0-9","10-19",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ fecha        : Factor w/ 817 levels "2020-01-01","2020-01-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ num_casos    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ num_hosp     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ num_uci      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ num_def      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ date         : Date, format: "2020-01-01" "2020-01-01" ...
##  $ yearday      : Factor w/ 366 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year         : Factor w/ 3 levels "2020","2021",..: 1 1 1 1 1 1 1 1 1 1 ...

Graphs and frequency tables

library("ggplot2")

We initially represent the temporal daily evolution of deaths during pandemic. Since the initial information is distributed by different age groups and sex categories, we need to aggregate the numbers for all possible values of these two categorical variables:

dailydatspain=aggregate(cbind(num_casos=data$num_casos,
                              num_hosp=data$num_hosp,
                              num_uci=data$num_uci,
                              num_def=data$num_def
                              ), 
                        by=list(fecha=data$fecha,
                                yearday=data$yearday,
                                 year=data$year), 
                        FUN=sum)
str(dailydatspain)
## 'data.frame':    817 obs. of  7 variables:
##  $ fecha    : Factor w/ 817 levels "2020-01-01","2020-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ yearday  : Factor w/ 366 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ year     : Factor w/ 3 levels "2020","2021",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ num_casos: int  0 0 0 0 0 0 0 0 1 1 ...
##  $ num_hosp : int  8 75 8 6 7 5 6 4 8 9 ...
##  $ num_uci  : int  0 1 1 0 0 0 1 0 0 0 ...
##  $ num_def  : int  0 0 0 0 0 0 0 0 0 0 ...
f=ggplot(data=dailydatspain)+
  aes(x=fecha)+
  geom_point(aes(y=num_def),shape=23,colour="black",fill="blue")+
  xlab("Date")+
  ylab("COVID deaths")+
  scale_x_discrete(
    breaks=dailydatspain$fecha[seq(1, length(dailydatspain$fecha), by=15)],
    labels = dailydatspain$fecha[seq(1, length(dailydatspain$fecha), by=15)]
    )+
  theme(axis.text.x = element_text(angle = 90,hjust=1),text = element_text(size = 20))

ggsave(gsub(" ",
            "",
            paste("RGraphs/ISCIII_Spain.png"),
            fixed = TRUE),
       width=30,
       height=20,
       units = "cm") 

print(f)

Now we make the same plot but collapsed within one year and using different colors depending on the year of the record:

breaks=as.character(cumsum(c(31,31,28,30,31,30,31,31,30,31,30,32)))
labels = c("Jan",
               "Feb",
               "Mar",
               "Apr",
               "May",
               "Jun",
               "Jul",
               "Aug",
               "Sep",
               "Oct",
               "Nov",
               "Dec")

f=ggplot(data=dailydatspain)+
  aes(x=yearday,colour=year,fill=year)+
  geom_point(aes(y=num_def))+
  xlab("Date")+
  ylab("COVID deaths")+
  scale_x_discrete(
    breaks=breaks,
    labels =labels
  )+
  theme(axis.text.x = element_text(hjust=1))+
  ggtitle(paste("Datos procedentes del ISCII (",Sys.Date(),")"))+
  theme(plot.title = element_text(size=15,hjust = 0.5),text = element_text(size = 15),axis.text = element_text(angle = 45))

ggsave(gsub(" ",
            "",
            paste("RGraphs/ISCIII_Spain2.png"),
            fixed = TRUE),
       width=30,
       height=20,
       units = "cm") 

print(f)

For making two-way or contingency tables, it is convenient to organize the number of deaths by province and age group:

datprovincias=aggregate(list(num_def=data$num_def),
                        by=list(provincias=data$provincia_iso,
                                grupo_edad=data$grupo_edad),
                        FUN=sum)


str(datprovincias)
## 'data.frame':    520 obs. of  3 variables:
##  $ provincias: Factor w/ 52 levels "A","AB","AL",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ grupo_edad: Factor w/ 10 levels "0-9","10-19",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ num_def   : int  0 1 1 0 2 0 1 0 0 1 ...

With this new information available it is possible to construct the following table:

ctable=matrix(datprovincias$num_def, nrow = length(levels(data$grupo_edad)), byrow = TRUE)
rownames(ctable)=levels(data$grupo_edad)
colnames(ctable)=levels(data$provincia_iso)
ctable=cbind(ctable,apply(ctable,1,sum))
ctable=rbind(ctable,apply(ctable,2,sum))
rownames(ctable)[nrow(ctable)]="All"
colnames(ctable)[ncol(ctable)]="SP"
print(t(ctable))
##    0-9 10-19 20-29 30-39 40-49 50-59 60-69 70-79   80+  NC    All
## A    0     1     3    10    58   160   407   869  1956   0   3464
## AB   1     0     1     1     4    37    86   219   634   0    983
## AL   1     0     7    14    46    80   178   297   526   0   1149
## AV   0     0     1     0     3    17    49    89   416   0    575
## B    2     3    12    27   134   456  1324  3062  9545   0  14565
## BA   0     0     2     5    10    56   130   244   741   1   1189
## BI   1     0     2    14    30    80   246   566  2115   0   3054
## BU   0     0     0     1     7    25    72   182   799   0   1086
## C    0     0     0     3    10    43    96   302   924   0   1378
## CA   1     1     7    14    26   126   274   464   851   0   1764
## CC   1     0     1     0     8    31    94   190   746   0   1071
## CE   0     0     2     0     4     9    42    34    55   0    146
## CO   0     0     1     6    16    49   133   286   829   0   1320
## CR   0     1     5     8    26    60   196   427  1177   0   1900
## CS   0     0     1     1    13    41   105   241   651   0   1053
## CU   0     0     0     1     4    27    58   123   497   0    710
## GC   1     0     1     7    21    50   127   175   317   0    699
## GI   1     0     3     6    22    86   191   371  1177   0   1857
## GR   0     1    10    18    34   105   257   513  1139   0   2077
## GU   0     0     0     1    11    20    54   133   534   0    753
## H    0     0     0     3    11    27    57   128   259   0    485
## HU   0     0     1     4     3    20    55   120   536   4    743
## J    0     2     3     3    23    78   159   286   768   0   1322
## L    0     0     1     3     9    36    84   153   618   0    904
## LE   0     0     1     2     4    28    77   191   808   0   1111
## LO   0     1     0     4    15    24    61   180   627   0    912
## LU   0     0     0     0     1     5    26    42   254   0    328
## M    5     5    21    49   243   760  2045  4527 11278   0  18933
## MA   2     3    16    18    35   150   328   628  1172   0   2352
## ML   0     0     0     2     4    16    21    36    75   0    154
## MU   0     0     1     7    39   105   258   431  1375   0   2216
## NC   0     0     0     2     5    15    22    60   142 184    430
## O    0     0     1     5    17    73   220   485  1802   0   2603
## OR   0     0     0     1     0    14    20    93   417   0    545
## P    0     0     0     1     8    18    66   128   444   0    665
## PM   0     3     2     5    19    53   157   331   807   0   1377
## PO   0     0     1     1     3    38    71   182   651   0    947
## S    0     0     0     1     6    23    53   136   570   0    789
## SA   1     0     1     2     6    24    91   184   879   1   1189
## SE   1     1    10    25    49   203   441   720  1433   0   2883
## SG   0     0     0     0     5    16    51   101   490   0    663
## SO   0     0     0     0     4    12    33    70   370   0    489
## SS   1     0     1     8    17    76   186   413  1349   0   2051
## T    0     2     2     6    14    46   133   311   960   0   1474
## TE   1     1     1     0     6    14    38    71   410   1    543
## TF   2     0     5     8    17    73   122   250   460   0    937
## TO   1     0     4     7    25    96   308   570  1744   0   2755
## V    0     1     4    20    43   174   507  1023  2842   0   4614
## VA   0     1     0     6     7    51   149   353  1316   0   1883
## VI   0     0     3     1    10    23    75   214   585   0    911
## Z    1     1     1    10    26   107   294   603  2305  16   3364
## ZA   0     0     0     1     4    12    57   106   480   0    660
## SP  24    28   139   342  1165  3968 10384 21913 63855 207 102025

If we are just interested in number of deaths per age group regardless of the province:

library("knitr")
prop=prop.table(ctable[1:(nrow(ctable)-1),ncol(ctable)])
prop=cbind(100*prop,100*cumsum(prop),ctable[1:(nrow(ctable)-1),ncol(ctable)],cumsum(ctable[1:(nrow(ctable)-1),ncol(ctable)]))
colnames(prop)=c("% deaths","% Cum deaths","Deaths","Cum deaths")
kable(prop,caption="Proportion of deaths by age group")
Proportion of deaths by age group
% deaths % Cum deaths Deaths Cum deaths
0-9 0.0235236 0.0235236 24 24
10-19 0.0274443 0.0509679 28 52
20-29 0.1362411 0.1872090 139 191
30-39 0.3352120 0.5224210 342 533
40-49 1.1418770 1.6642980 1165 1698
50-59 3.8892428 5.5535408 3968 5666
60-69 10.1778976 15.7314384 10384 16050
70-79 21.4780691 37.2095075 21913 37963
80+ 62.5876011 99.7971086 63855 101818
NC 0.2028914 100.0000000 207 102025

Next we want to show the information storage in the “datprovincias” data frame using a box plot. Note that we have to use a logarithmic scale in the vertical axis:

f=ggplot(data=datprovincias)+
  aes(x=grupo_edad,colour=grupo_edad)+
  geom_boxplot(aes(y=num_def))+
  geom_jitter(aes(y=num_def))+
  xlab("Age group")+
  ylab("COVID deaths by province")+
  scale_y_log10()+
  ggtitle(paste("Official COVID deaths by province and age group (",Sys.Date(),")"))+
  theme(plot.title = element_text(hjust = 0.5),text = element_text(size = 20),legend.position = "none")

  ggsave(gsub(" ",
            "",
            paste("RGraphs/ISCIII_Boxplot_Age_Province.png"),
            fixed = TRUE),
       width=30,
       height=20,
       units = "cm") 
  
  print(f)

We could present the same data but acumulated by age group: