Description


California Infectious Diseases includes data on infectious diseases in the state of California, broken down by county, sex, and year. The data comes from the California Health and Human Services website https://data.chhs.ca.gov/user/register, where open data is provided to the public.

In this R Markdown file, the data is analyzed for patterns and/or anomalies, both spatially and over time.

Copyright (C) 2018 Crista Moreno

California Infectious Diseases is free software: you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.

Load R Packages


library(ggplot2)
library(plyr)
library(dplyr)
library(magrittr) #for piping commands
library(data.table)
library(scales)
library(curl)
library(reshape)  #to melt the data and plot multiple columns on one graph
library(lubridate)
library(coefplot) #for plotting coefficients of regression
library(ggpmisc) #for plotting regression line on ggplot
library(maps)
#install.packages("usmap")
library(usmap)
library(ggmap)
library(mapdata)
library(stringr)
#devtools::install_github("dkahle/ggmap")

Load the Map Data for the United States


For this data analysis, county map data for the state of California is included. For an introduction to using map data with R and ggplot, the reader is referred to the following tutorial, hosted on GitHub Pages:

http://eriqande.github.io/rep-res-web/lectures/making-maps-with-R.html

# State-level map polygons, converted by map_data() into a data frame that
# ggplot2 can draw.
states <- ggplot2::map_data(map = "state")

Subset the State Map Data for the state of California


# Keep only the rows of the state map whose region is California, then
# preview the first few rows.
california <- states[which(states$region == "california"), ]
head(california)

Get the County Data for California


# County-level map polygons for the whole country, narrowed down to the
# counties whose region is California; preview the first few rows.
counties <- ggplot2::map_data(map = "county")
california_counties <- counties[which(counties$region == "california"), ]
head(california_counties)

Plot the state of California


# Base plot reused by every map below: the California state outline drawn as
# a white polygon with a dark green border, at a fixed 1.3 y/x aspect ratio.
california_base <- ggplot(california, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", colour = "darkgreen") +
  coord_fixed(ratio = 1.3)

# Draw the outline on ggmap's blank theme.
california_base + theme_nothing()

Plot the California counties


# Overlay the county polygons on the state outline. The fill is mapped to the
# polygon group id purely to tell neighbouring counties apart.
california_base +
  geom_polygon(
    data = california_counties,
    mapping = aes(group = group, fill = group),
    colour = "white"
  ) +
  # Redraw the state border last so it sits on top of the county fills.
  geom_polygon(fill = NA, colour = "black") +
  theme_nothing()

Note that the coloring of the counties does not have any meaning in the plot above.

Read the California State Infectious Disease data into a dataframe


# Read the infectious disease export into `data`.
#
# The raw file stores some characters as UTF-7 escape sequences: `+ACI-` is a
# double quote, `+AC0-` a hyphen and `+ADw-` a less-than sign (all three show
# up in the disease names when the file is read as-is). Because the quote
# characters are escaped, read.csv() cannot recognise quoted fields, so a
# disease name that contains a comma is split across two columns and every
# later value in that row is shifted. Decoding the file as UTF-7 while
# reading restores the quotes and keeps those rows aligned.
csv_path <- "infectious-disease-cases-by-county-year-and-sex.csv"

# Not every iconv build ships a UTF-7 converter. fileEncoding = "" is the
# read.csv() default and means "do not re-encode", i.e. the old behaviour.
csv_encoding <- if ("UTF-7" %in% toupper(iconvlist())) {
  "UTF-7"
} else {
  warning(
    "iconv has no UTF-7 converter on this platform; reading '", csv_path,
    "' without decoding, so rows with quoted disease names may be shifted.",
    call. = FALSE
  )
  ""
}

data <- read.csv(csv_path, header = TRUE, fileEncoding = csv_encoding)
head(data)

Inspect the California Infectious Disease Data


# Check how read.csv() typed the Year and Sex columns and count the distinct
# Sex values. Lines starting with `##` are the output recorded when this
# document was rendered.
class(data$Year)
## [1] "factor"
class(data$Sex)
## [1] "factor"
length(unique(data$Sex))
## [1] 19

There should be only three values for Sex in this data set. (This is not a political statement, just a note on how the data was recorded: in this data set, every record is labeled as one of Female, Male, or Total.)

# The only values the Sex column is expected to contain.
genders <- c("Female", "Male", "Total")

# Distinct Disease/County pairs taken from the rows whose Sex value is not one
# of the expected three.
data %>%
  filter(!(Sex %in% genders)) %>%
  select(Disease, County) %>%
  unique()

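The disease names listed above have spilled into the County column and, more generally, shifted the data in these rows. A likely cause is that these disease names contain commas and that their surrounding quotes are stored in the file as the escape sequence `+ACI-` (visible in the factor levels printed further below), so the CSV reader splits each name across two columns. For now, these diseases will be set aside.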

Change Variable Type of Count from Factor to Numeric


# Count was read as a factor. Go through character first: calling
# as.numeric() directly on a factor would return its internal level codes
# rather than the numbers printed in the file. Entries that are not numeric
# become NA, which is what the recorded warning reports.
class(data$Count)
## [1] "factor"
data$Count <- as.numeric(as.character(data$Count))
## Warning: NAs introduced by coercion

# Column types of the other fields used later on.
class(data$Population)
## [1] "integer"
class(data$Disease)
## [1] "factor"

Temporary Subsetting of the Data


Some of the rows have long disease names and have thus shifted the data to the next column. I will remove rows that have values other than Female, Male, or Total for the Sex column to work around this issue. This action significantly reduces the size of the data set and is not a permanent solution to the problem. So from this point on the analysis is based on a subset of the original data set, and is therefore not as reliable.

# Keep only the rows with a recognised Sex value and a Year between 2001 and
# 2018; this drops the rows whose columns were shifted.
data <- data %>%
  filter(
    Sex %in% c("Female", "Male", "Total"),
    Year %in% 2001:2018
  )

# Confirm which Sex and Year values remain.
unique(select(data, Sex))
unique(select(data, Year))

List the Counties for this data set


# One row per distinct County value left in the data.
unique(select(data, County))

List the Diseases for this data set


# Distinct Disease values left in the data (recorded output follows).
unique(data$Disease)
##  [1] Amebiasis                                                                        
##  [2] Anthrax                                                                          
##  [3] Anaplasmosis and Ehrlichiosis                                                    
##  [4] Babesiosis                                                                       
##  [5] Cholera                                                                          
##  [6] Brucellosis                                                                      
##  [7] Campylobacteriosis                                                               
##  [8] Chlamydia                                                                        
##  [9] Dengue                                                                           
## [10] Ciguatera Fish Poisoning                                                         
## [11] Coccidioidomycosis                                                               
## [12] Creutzfeldt+AC0-Jakob Disease and other Transmissible Spongiform Encephalopathies
## [13] Cryptosporidiosis                                                                
## [14] Cyclosporiasis                                                                   
## [15] Cysticercosis or Taeniasis                                                       
## [16] Diphtheria                                                                       
## [17] Giardiasis                                                                       
## [18] HIV                                                                              
## [19] Domoic Acid Poisoning                                                            
## [20] E. coli O157                                                                     
## [21] E. coli Other STEC (non+AC0-O157)                                                
## [22] Early Syphilis                                                                   
## [23] Gonorrhea                                                                        
## [24] Hantavirus Infection                                                             
## [25] Hemolytic Uremic Syndrome                                                        
## [26] Hepatitis A                                                                      
## [27] Influenza Death (+ADw-65 years of age)                                           
## [28] Leprosy                                                                          
## [29] Invasive Meningococcal Disease                                                   
## [30] Legionellosis                                                                    
## [31] Leptospirosis                                                                    
## [32] Malaria                                                                          
## [33] Listeriosis                                                                      
## [34] Lyme Disease                                                                     
## [35] Measles                                                                          
## [36] Mumps                                                                            
## [37] Paralytic Shellfish Poisoning                                                    
## [38] Pertussis                                                                        
## [39] Q Fever                                                                          
## [40] Psittacosis                                                                      
## [41] Rubella                                                                          
## [42] Relapsing Fever                                                                  
## [43] Salmonellosis                                                                    
## [44] Tetanus                                                                          
## [45] Scombroid Fish Poisoning                                                         
## [46] Shiga Toxin Positive Feces (without culture confirmation)                        
## [47] Shigellosis                                                                      
## [48] Spotted Fever Rickettsiosis                                                      
## [49] Staphylococcus aureus Infection (cases resulting in death or ICU)                
## [50] Streptococcal Infection (cases in food and dairy workers)                        
## [51] Toxic Shock Syndrome (Non+AC0-Streptococcal)                                     
## [52] Trichinosis                                                                      
## [53] Tularemia                                                                        
## [54] Tuberculosis                                                                     
## [55] Typhus Fever                                                                     
## [56] Varicella Hospitalizations                                                       
## [57] Vibrio Infection (non+AC0-Cholera)                                               
## [58] Yersiniosis                                                                      
## 68 Levels:   +AC0- +ACI-Botulism +ACI-Hepatitis B ... Yersiniosis

Plot the E. coli O157 Disease Data for the year 2010


Store the disease data for the year 2010 in data_temp.

# Disease(s) and year(s) to examine.
diseases <- "E. coli O157"
years <- 2010

# Rows for the chosen disease and year that report at least one case. The
# `Count != 0` condition also drops rows whose Count is NA.
data_temp <- data %>%
  filter(Disease %in% diseases, Year %in% years, Count != 0)

# Show the selection ordered from the highest Count to the lowest.
data_temp[order(-data_temp$Count), ]

# Stacked bars of Count per Sex, with one fill colour per county.
ggplot(data_temp, aes(x = Sex, y = Count, fill = County)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 9, angle = 90))

# Prepare the rows that will be joined to the county map:
#   * keep only the Sex == "Total" rows;
#   * Percent_Population: cases as a percentage of the county population;
#   * County: lower-cased;
#   * subregion: a copy of the lower-cased County, added under the column name
#     that the county map data is joined on.
# Zero-count rows were already removed above, so none remain to drop here.
data_temp <- data_temp %>%
  filter(Sex %in% "Total") %>%
  mutate(
    Percent_Population = (Count / Population) * 100,
    County = tolower(County),
    subregion = County
  )
data_temp

Display the Joined Data


# Attach the disease figures to the county polygons. An inner join keeps only
# the polygon rows whose subregion also appears in data_temp.
joined_data <- california_counties %>%
  inner_join(data_temp, by = "subregion")
joined_data

Plot Map for Counties with Infections and color by the Percentage of Population Infected


# Theme overrides that blank out the axes, grid, and panel border so that only
# the map itself is drawn.
very_simple_theme <- theme(
  axis.title = element_blank(),
  axis.text = element_blank(),
  axis.ticks = element_blank(),
  axis.line = element_blank(),
  panel.grid = element_blank(),
  panel.border = element_blank()
)

# County map filled by Percent_Population. The state border is redrawn after
# the county layer so it stays visible, and the blanking overrides are added
# after theme_bw() so they are not reset by it.
disease_percent_population_map <- california_base +
  geom_polygon(
    data = joined_data,
    mapping = aes(fill = Percent_Population),
    colour = "white"
  ) +
  geom_polygon(fill = NA, colour = "black") +
  theme_bw() +
  very_simple_theme

disease_percent_population_map

# Same map with a reversed seven-colour rainbow palette on a log-transformed
# fill scale.
disease_percent_population_map_scaled <- disease_percent_population_map +
  scale_fill_gradientn(
    colours = rev(rainbow(7)),
    breaks = waiver(),
    trans = "log"
  )

disease_percent_population_map_scaled