library(maptools, quietly = TRUE) # read ESRI shapefiles
library(sp) # spatial data classes
library(spdep, quietly = TRUE) # neighbor/adjacency structures
library(INLA, quietly = TRUE) # approximate Bayesian inference
library(ggplot2)
# library(kableExtra)
library(flextable) # formatted tables
testDat<-readRDS("/Users/charlie/OneDrive - NYU Langone Health/covidINLA/testDat.RDS")
nyc <- readRDS("/Users/charlie/OneDrive - NYU Langone Health/covidINLA/nycMap4COVID.RDS")
nycDat <- attr(nyc, "data")
ord <- match(nycDat$ZCTA5CE10, testDat$zcta) # align testDat rows with the map polygon order
testDat <- testDat[ord,]
testDat$ID.area <- seq(1, nrow(testDat)) # sequential area index for the INLA random effects
nyc.adj <- "/Users/charlie/OneDrive - NYU Langone Health/covidINLA/nyc.graph" # path to the INLA adjacency graph file
library(DescTools)
options(scipen = 10) # don't use scientific notation
options(fmt.num=structure(list(digits=2, big.mark=","), class="fmt")) # force use commas as thousands separator and 2 decimal places
Desc(testDat$rate, main="Positive COVID-19 Tests per 10,000 ZCTA Population")
Desc(testDat$prop, main="Positive COVID-19 Tests per 10,000 ZCTA Tests")
# ZCTAs with the lowest and highest rates and proportions
testDat$zcta[testDat$rate %in% head(sort(testDat$rate))]
testDat$zcta[testDat$rate %in% tail(sort(testDat$rate))]
testDat$zcta[testDat$prop %in% head(sort(testDat$prop))]
testDat$zcta[testDat$prop %in% tail(sort(testDat$prop))]
# table highest and lowest quantiles
cut.rate<-quantile(testDat$rate,probs = seq(0, 1, by = 0.20))
testDat$rate.cut<-cut(testDat$rate,breaks=cut.rate,include.lowest=TRUE)
testDat$rate.tab<-NA
testDat$rate.tab[testDat$rate.cut=="[42,105]"]<-"Low"
testDat$rate.tab[testDat$rate.cut=="(222,323]"]<-"High"
tab1<-data.frame(TOne(testDat[,c("MHI","schDen","POPDEN2010","houseDensity1","congdonIndex","propBlack","propOld","propHisp","heartDisease","COPD")], grp=testDat[,c("rate.tab")], vnames=c("MHI", "School Density", "Population Density", "Housing Density", "Congdon Index", "Black", "Older than 65", "Hispanic", "Heart Disease", "COPD"))[1:10,1:5])
regulartable(data.frame(tab1[1:10,]), cwidth=1.2)
# Choropleths of Spatial Risk
We start with a simple choropleth of positive COVID-19 tests per 10,000 ZCTA population.
cut.rate<-quantile(testDat$rate,probs = seq(0, 1, by = 0.20))
testDat$rate.cut<-cut(testDat$rate,breaks=cut.rate,include.lowest=TRUE)
# summary(testDat$rate.cut)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=rate.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Rate (Quantiles)")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("")+ylab("")+
ggtitle("Positive COVID19 Tests per 10,000 Population,
New York City ZIP Code Tabulation Areas April 3-22, 2019")
print(p4)
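The three fortify/merge/order lines above are repeated verbatim before each map below; a small helper (a sketch only, with the hypothetical name makeMapDF, assuming the nyc map object and the zcta column used above) could stand in for them:
# sketch: build the ggplot-ready data frame used by each choropleth
makeMapDF <- function(map, dat) {
  df <- fortify(map, region = "ZCTA5CE10") # polygons to data frame
  df <- merge(df, dat, by.x = "id", by.y = "zcta", all = FALSE) # attach attributes
  df[order(df$group, df$order), ] # restore polygon drawing order
}
# usage: nyc.df <- makeMapDF(nyc, testDat)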
Next, we map the proportion of positive COVID-19 tests per 10,000 total tests.
cut.prop<-quantile(testDat$prop,probs = seq(0, 1, by = 0.20))
testDat$prop.cut<-cut(testDat$prop,breaks=cut.prop,include.lowest=TRUE)
# summary(testDat$prop.cut)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=prop.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Proportion (Quantiles)", labels= c("2590, 4360", "4360, 5090", "5090, 5640", "5640, 5890", "5890, 7330" ))
p4<- p3 + theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("Latitude (74.2179 W)")+ylab("Longitude (43.2994 N)")+
ggtitle("Positive COVID19 Tests per 10,000 Tests,
New York City ZIP Code Tabulation Areas April 3-22, 2019")+ theme(plot.title = element_text(hjust = 0.5))
print(p4)
ggsave("/Users/charlie/OneDrive\ -\ NYU\ Langone\ Health/covidINLA/covidINLAwriteup/covidINLA_annalsEpi/covidINLA_annalsEpiRev1/covidINLA_annalsRev1FIG2.jpg", p4)
The first (frailty) model illustrates the random effect term, with no explicit spatial component.
formulaUH <- Positive~ f(ID.area, model = "iid", offset(Total)) # specify model
resultUH <- inla(formulaUH,family="poisson", data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultUH)
# exp(resultUH$summary.fixed)
# mean(testDat$Positive, na.rm=T) # model results differ because adjusted for population
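Note that the exposure enters this formula through offset(Total) inside f(); an alternative, arguably more conventional INLA specification (a sketch only; all results reported here use the formula above) supplies the exposure through inla()'s E argument, so the expected counts scale the Poisson mean:
# sketch: frailty model with the ZCTA test totals as the Poisson exposure
formulaUH2 <- Positive ~ f(ID.area, model = "iid")
resultUH2 <- inla(formulaUH2, family = "poisson", E = testDat$Total,
                  data = testDat, control.compute = list(dic = TRUE, cpo = TRUE))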
The line graph plots the density of the random effect term around the mean value, restricting some extreme negative values to better illustrate the (approximately) normal distribution of the random effects.
re1<-resultUH$summary.random$ID.area[,2]
# plot(density(re1)) # plot density random effects
plot(density(re1[re1>-2e-04]), main="Density Plot Random Effects Frailty Model") # restrict
We next map the random effects term to illustrate the distribution of the unstructured heterogeneity or variance. The results (as expected) appear generally as a spatially random distribution of variance above and below the mean value.
# add random effects results to dataframe
testDat$UH<-re1
# create cuts
cuts <-quantile(testDat$UH,probs = seq(0, 1, by = 0.20))
testDat$UH.cut<-cut(testDat$UH,breaks=cuts,include.lowest=TRUE)
#merge dataframes, save to data slot of map
testDat$ZCTA5CE10<-testDat$zcta
attr(nyc,"data") <- merge(nycDat,testDat,by="ZCTA5CE10")
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
library(RColorBrewer)
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=UH.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Random\nEffects")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("")+ylab("")+
ggtitle("Unstructured Heterogeneity (Above and Below Mean), Positive COVID19 Tests,
New York City ZIP Code Tabulation Areas April 3-8, 2019")
print(p4)
The convolution model adds a spatially-structured conditional autoregression term to the spatially-unstructured heterogeneity random effect term of the frailty model.
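In model form, for ZCTA \(i\), \(y_i \sim \text{Poisson}(\mu_i)\) with \(\log(\mu_i) = \alpha + \upsilon_i + \nu_i\), where \(\upsilon_i\) is the unstructured (iid) heterogeneity and \(\nu_i\) the spatially structured (CAR) component; this matches the notation \(\theta = \alpha + \upsilon + \nu\) used for the fitted values below.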
formulaCAR <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)
resultCAR <- inla(formulaCAR,family="poisson", data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCAR)
# exp(resultCAR$summary.fixed)
re<-resultCAR$summary.random$ID.area[1:length(testDat$zcta),2] # first n elements: combined (unstructured + spatial) effects
# plot(density(re[re>-0.002]), main="Density Plot Random Effects Convolution Model") # plot density random effects
car<-resultCAR$summary.random$ID.area[(length(testDat$zcta)+1):(2*length(testDat$zcta)),2] # next n elements: spatially structured component
# plot(density(car[car>-0.0005]),main="Density Plot Spatial Effects Convolution Model") # plot density spatial effects
We next map the spatially structured heterogeneity, showing the variance values above and below zero; this appears to have greater spatial structure than the simple frailty model, with some ZCTAs demonstrating higher variance estimates than others.
# add car results to dataframe
testDat$car<-car
# create cuts
cuts <- quantile(testDat$car,probs = seq(0, 1, by = 0.20))
# summary(cuts)
testDat$car.cut<-cut(testDat$car,breaks=cuts, include.lowest=TRUE)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
# library(RColorBrewer)
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=car.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Spatial\nEffects")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("")+ylab("")+
ggtitle("Spatially Structured (CAR) Heterogeneity,Positive COVID19 Tests,
New York City ZIP Code Tabulation Areas April 3-22, 2019")
print(p4)
Using calculations from the fitted convolution model, we first map the so-called fitted values, which are the sum of the mean value plus the unstructured and spatially structured variance components (\(\theta = \alpha + \upsilon + \nu\)).
#risk (theta = alpha + upsilon + nu)
CARfit <- resultCAR$summary.fitted.values[,1] # posterior mean fitted values
testDat$fit<-CARfit
cut.fit<-quantile(testDat$fit,probs = seq(0, 1, by = 0.20))
testDat$fit.cut<-cut(testDat$fit,breaks=cut.fit,include.lowest=TRUE)
# summary(testDat$fit.cut)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=fit.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Fitted Values")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("")+ylab("")+
ggtitle("Fitted Model Values, Heterogeneity,Positive COVID19 Tests,
New York City ZIP Code Tabulation Areas April 3-22, 2019")
print(p4)
Next, the spatial risk estimate is calculated as the sum of the unstructured and spatially structured variance components (\(\zeta = \upsilon + \nu\)).
#spatial risk (zeta = upsilon + nu)
CARmarginals <- resultCAR$marginals.random$ID.area[1:177] # first 177 marginals: combined random effects (zeta)
CARzeta <- lapply(CARmarginals,function(x)inla.emarginal(exp,x)) # exponentiate to the relative risk scale
testDat$risk<-unlist(CARzeta)
cut.risk<-quantile(testDat$risk, probs = seq(0, 1, by = 0.20))
testDat$risk.cut<-cut(testDat$risk,breaks=cut.risk,include.lowest=TRUE)
testDat$risk.cut2<-cut(testDat$risk,breaks=c(0, 1, 1.001, 1.002, 1.003))
# summary(testDat$risk.cut)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=risk.cut2))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Spatial\nRisk")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("Latitude (74.2179 W)")+ylab("Longitude (43.2994 N)") +
ggtitle("Spatially Structured Risk Variance Estimates, Positive COVID19 Tests,
New York City ZIP Code Tabulation Areas April 3-22, 2019")+ theme(plot.title = element_text(hjust = 0.5))
print(p4)
ggsave("/Users/charlie/OneDrive\ -\ NYU\ Langone\ Health/covidINLA/covidINLAwriteup/covidINLA_annalsEpi/covidINLA_annalsEpiRev1/covidINLA_annalsRev1FIG3.jpg", p4)
Finally, we estimate the proportion of the variance explained by geographic variation, or place, which for this model is approximately 32%.
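The estimate is the ratio \(\sigma^2_{\nu} / (\sigma^2_{\nu} + \sigma^2_{\upsilon})\), i.e., the spatially structured share of the total random-effect variance; the code below approximates \(\sigma^2_{\nu}\) by simulating from the marginals of the spatial component and \(\sigma^2_{\upsilon}\) from the posterior of the iid precision.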
mat.marg <- matrix(NA, nrow=177, ncol=1000)
m<-resultCAR$marginals.random$ID.area
for (i in 1:177){
u<-m[[177+i]] # marginals 178-354: spatially structured component
s<-inla.rmarginal(1000, u) # 1000 draws from each marginal
mat.marg[i,]<-s}
var.RRspatial<-mean(apply(mat.marg, 2, sd))^2 # empirical variance of the spatial component
var.RRhet<-inla.emarginal(function(x) 1/x,
resultCAR$marginals.hyper$"Precision for ID.area (iid component)") # variance of the iid component
# var.RRspatial/(var.RRspatial+var.RRhet) # 0.3200823
The following are exponentiated coefficient results for unadjusted single-covariate models of associations with positive COVID-19 test counts.
formulaCOV1 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(POPDEN2010)
resultCOV1 <- inla(formulaCOV1,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV1)
# resultCOV1$dic
# exp(resultCOV1$summary.fixed)
formulaCOV2 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(MHI)
resultCOV2 <- inla(formulaCOV2,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV2)
# resultCOV2$dic
# exp(resultCOV2$summary.fixed)
formulaCOV3 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(schDen)
resultCOV3 <- inla(formulaCOV3,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV3)
# resultCOV3$dic
# exp(resultCOV3$summary.fixed)
formulaCOV4 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(propOld)
resultCOV4 <- inla(formulaCOV4,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV4)
# resultCOV4$dic
# exp(resultCOV4$summary.fixed)
formulaCOV5 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(propAsian)
resultCOV5 <- inla(formulaCOV5,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV5)
# resultCOV5$dic
# exp(resultCOV5$summary.fixed)
formulaCOV6 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(houseDensity1)
resultCOV6 <- inla(formulaCOV6,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV6)
# resultCOV6$dic
# exp(resultCOV6$summary.fixed)
formulaCOV7 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ congdonIndex
resultCOV7 <- inla(formulaCOV7,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV7)
# resultCOV7$dic
# exp(resultCOV7$summary.fixed)
formulaCOV8 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(nonEnglish)
resultCOV8 <- inla(formulaCOV8,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV8)
# resultCOV8$dic
# exp(resultCOV8$summary.fixed)
formulaCOV9 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(propBlack)
resultCOV9 <- inla(formulaCOV9,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV9)
# resultCOV9$dic
# exp(resultCOV9$summary.fixed)
formulaCOV10 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(propPublicAssist2)
resultCOV10 <- inla(formulaCOV10,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV10)
# resultCOV10$dic
# exp(resultCOV10$summary.fixed)
formulaCOV11 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(heartDisease)
resultCOV11 <- inla(formulaCOV11,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV11)
# resultCOV11$dic
# exp(resultCOV11$summary.fixed)
formulaCOV12 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(COPD)
resultCOV12 <- inla(formulaCOV12,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV12)
# resultCOV12$dic
# exp(resultCOV12$summary.fixed)
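The twelve single-covariate fits above differ only in the covariate term. A more compact equivalent (a sketch only, using the hypothetical names covars and univarFits, assuming the testDat and nyc.adj objects above, and scaling every covariate, whereas the original leaves congdonIndex unscaled) loops over the variable names:
# sketch: fit all single-covariate convolution models in one pass
covars <- c("POPDEN2010", "MHI", "schDen", "propOld", "propAsian",
            "houseDensity1", "congdonIndex", "nonEnglish", "propBlack",
            "propPublicAssist2", "heartDisease", "COPD")
univarFits <- lapply(covars, function(v) {
  f <- as.formula(paste0(
    "Positive ~ f(ID.area, model='bym', offset(Total), ",
    "graph=nyc.adj, adjust.for.con.comp=FALSE) + scale(", v, ")"))
  inla(f, family = "poisson", data = testDat,
       control.compute = list(dic = TRUE, cpo = TRUE))
})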
The results of the single-covariate models of associations with positive COVID-19 test counts are summarized below.
summaryUnivariate<-data.frame(rbind(
round(exp(resultCOV1$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV2$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV3$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV4$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV5$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV6$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV7$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV8$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV9$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV10$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV11$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV12$summary.fixed[2,c(1,3,5)]),1)
))
names(summaryUnivariate)<-c("IDR", "2.5%", "97.5%")
summaryUnivariate$Model<-c("Population Density", "Median Household Income", "School Density", "Older than 65", "Asian", "Housing Density", "Congdon Index", "Non-English Language", "Black", "Public Assistance", "Heart Disease", "COPD")
summaryUnivariate<-summaryUnivariate[,c(4,1,2,3)]
regulartable(summaryUnivariate)
formulaMultivar1 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(COPD) + scale(heartDisease) +scale(propBlack)+scale(propOld)+scale(houseDensity1)
resultMultivar1 <- inla(formulaMultivar1,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# stargazer(resultMultivar1)
# summary(resultMultivar1)
# resultMultivar1$dic
# exp(resultMultivar1$summary.fixed)
multiVarMod<-data.frame(round(exp(resultMultivar1$summary.fixed)[,c(1,3,5)],2))
names(multiVarMod)<-c("IDR", "2.5%", "97.5%")
multiVarMod$Variable<-c("Intercept","COPD", "Heart Disease", "Black", "Older than 65", "Housing Density" )
multiVarMod<-multiVarMod[,c(4,1,2,3)]
rownames(multiVarMod)<-1:nrow(multiVarMod)
regulartable(multiVarMod)
Covariate data at the ZCTA level were obtained from the US Census and from the Institute for Social Research.
covariates<-read.csv("~/zcta_varsFIN.csv",header=T, stringsAsFactors=F)
str(covariates)
# select variables
# schDen (School Density (schools per square kilometer))
# POP2010 2010 Population: Total
# AG65UP 2010 Population: 65 years and older
# BLK 2010 Population: Black or African American alone
# ASN 2010 Population: Asian alone
# POPDEN2010 2010 Population Density (persons per square kilometer)
# HOUS2000 2000 Public Assistance Income in 1999 for Households - Total
# POPLNGOTH population older than 5 speaking other than English (to be used with POPTOT5 (population older than 5) to calculate proportion )
covariates2<-covariates[,c("zcta10","schDen","POP2010","AG65UP","BLK","ASN","POPDEN2010","HOUS2000", "POPTOT5", "POPLNGOTH")]
covariates2$zcta<-substr(covariates2$zcta10,3,7) # extract the 5-digit ZCTA from the zcta10 code
covariates2$nonEnglish<-covariates2$POPLNGOTH/covariates2$POPTOT5
covariates2$propBlack<-covariates2$BLK/covariates2$POP2010
covariates2$propOld<-covariates2$AG65UP/covariates2$POP2010
covariates2$propAsian<-covariates2$ASN/covariates2$POP2010
covariates2$propAssistance<-covariates2$HOUS2000/covariates2$POP2010
MHI<-read.csv("~/zctaMHI.csv",header=T, stringsAsFactors=F)
names(MHI)[1]<-"zcta"
MHI$zcta<-as.character(MHI$zcta)
MHI<-MHI[,-3] # remove POP variable
covariates3<-merge(MHI, covariates2, by="zcta", all.x=F, all.y=T)
Housing density and area in square miles at the ZCTA level can be created with the code below, taken from “spatialEpiModels.Rnw”. We create two housing density measures: houseDensity1 is the simple rate of the number of houses per square mile; houseDensity2 is this rate divided by the total number of people in a ZCTA.
congdonZCTA<-read.table(file="~/censusPopHousing.txt", header=F, sep=",", skip=2, col.names=c("x1", "zcta", "x2", "x3","x4", "house.tot", "pop.tot", "house.rent", "house.occ"), colClasses=c("character", "character", "character", "character","character", "numeric", "numeric","numeric", "numeric"))
congdonZCTA<-congdonZCTA[,-c(1,3:5)]
congdonZCTA$zcta<-as.numeric(congdonZCTA$zcta)
complete<-complete.cases(congdonZCTA)
congdonZCTA<-congdonZCTA[complete,]
area<-read.table(file="~/censusSqMiles.txt", header=T)
area<-area[,-c(2,3,5:7)]
str(area)
# merge
congdonZCTA2<-merge(congdonZCTA, area, by="zcta", all.x=T, all.y=F)
congdonZCTA2<-congdonZCTA2[,-3] # remove pop.tot variable
congdonZCTA2$houseDensity1<-congdonZCTA2$house.tot/congdonZCTA2$sq.miles
# merge to covariate data
covariates4<-merge(covariates3, congdonZCTA2, by="zcta", all.x=T, all.y=F)
covariates4$houseDensity2<-covariates4$houseDensity1/covariates4$POP2010
Create a variable based on Peter Congdon’s social fragmentation index.¹ It combines four variables extracted from US Census data: the proportion of total housing units that are not owner occupied, the proportion of vacant housing units, the proportion of individuals living alone, and the proportion of housing units into which someone recently moved. Based on Census definitions, a “recent” move is defined as any time in the previous 9 years (since the last decennial census). As in the Congdon paper, the variables are standardized and summed with equal weight.
# get variables for Congdon Index at Census Tract level
library(UScensus2010tract)
data(new_york.tract10)
# extract housing variables for the social fragmentation index from
# the UScensus2010tract spatial polygon file
congdonZCTA<-attr(new_york.tract10, "data")
congdonZCTA<-subset(congdonZCTA, select=c(fips,H0010001,H0180002,H0180019,H0210001))
names(congdonZCTA)<-c("tract", "totHousing", "ownerHousing",
"aloneHousing", "vacantHousing")
# load pedDat from SRTS files
load(file="~/srtsINLA.RData")
congdonZCTA<-congdonZCTA[congdonZCTA$tract %in% pedDat$tract,]
# extract recent movers from downloaded US census file
recent<-read.csv("~/housingNYStracts.csv",header=T, stringsAsFactors=F)
recent<-subset(recent, select=c(GEO.id2, HC01_VC72))
names(recent)<-c("tract", "recentMove")
# merge congdonZCTA and recent movers files
congdonZCTA<-merge(congdonZCTA, recent, by="tract", all.x=T)
congdonZCTA$recentMove<-as.numeric(congdonZCTA$recentMove)
# set missing recent-mover data to zero
congdonZCTA$recentMove[is.na(congdonZCTA$recentMove)]<-0
# load crosswalk file of NYC tract to ZCTA level
xWalk<-read.csv("~/nycTract2ZCTA.csv",header=T, stringsAsFactors=F)
# format the tract variables for merging
# the tract variable is an 11-character string: first 2 digits state, next 3 county, final 6 census tract, e.g. "36005000100" "36005000200"
congdonZCTA$tract2<-substr(congdonZCTA$tract,6,11)
# xWalk variable is 7 characters with a "." separating the last 2 digits, which needs to be removed
xWalk$tract2<-gsub("[.]","",xWalk$tract)
# merge congdonZCTA and xWalk files
congdonZCTA1<-merge(congdonZCTA, xWalk, by="tract2", all.x=T, all.y=F)
# sum up the Congdon elements by ZCTA
congdonZCTA2<-aggregate(. ~ congdonZCTA1$zcta5, data=congdonZCTA1[,3:7], FUN=sum)
names(congdonZCTA2)[1]<-"zcta"
# CONGDON CALCULATIONS
# calculate proportions based on total housing units
# avoid division by zero
congdonZCTA2$owner<-congdonZCTA2$ownerHousing/(congdonZCTA2$totHousing+.1)
congdonZCTA2$alone<-congdonZCTA2$aloneHousing/(congdonZCTA2$totHousing+.1)
congdonZCTA2$vacant<-congdonZCTA2$vacantHousing/(congdonZCTA2$totHousing+.1)
congdonZCTA2$notOwner<-1-congdonZCTA2$owner
congdonZCTA2$recent<-congdonZCTA2$recentMove/(congdonZCTA2$totHousing+.1)
# standardize proportions
congdonZCTA2$aloneST<-
(congdonZCTA2$alone-mean(congdonZCTA2$alone, na.rm=T))/sd(congdonZCTA2$alone, na.rm=T)
summary(congdonZCTA2$aloneST)
congdonZCTA2$vacantST<-
(congdonZCTA2$vacant-mean(congdonZCTA2$vacant, na.rm=T))/sd(congdonZCTA2$vacant, na.rm=T)
summary(congdonZCTA2$vacantST)
congdonZCTA2$notOwnerST<-
(congdonZCTA2$notOwner-mean(congdonZCTA2$notOwner, na.rm=T))/sd(congdonZCTA2$notOwner, na.rm=T)
summary(congdonZCTA2$notOwnerST)
congdonZCTA2$recentST<-
(congdonZCTA2$recent-mean(congdonZCTA2$recent, na.rm=T))/sd(congdonZCTA2$recent, na.rm=T)
summary(congdonZCTA2$recentST)
# add standardized measures into single index
congdonZCTA2$index<-congdonZCTA2$aloneST+congdonZCTA2$vacantST+congdonZCTA2$notOwnerST+congdonZCTA2$recentST
summary(congdonZCTA2$index)
plot(density(congdonZCTA2$index))
quantile(congdonZCTA2$index, probs=c(0.05,0.95))
# 5% 95%
# -3.056998 3.944542
# save the ZCTA-level Congdon Index file
saveRDS(congdonZCTA2, "~/nycZCTACongdon.RDS")
congdonZCTA2<-readRDS("~/nycZCTACongdon.RDS")
# merge covariates4 and congdon
# restrict congdon file to just index and zcta
congdon<-congdonZCTA2[,c("zcta","index")]
covariates5<-merge(covariates4, congdon, by="zcta", all.x=T, all.y=F)
str(covariates5)
names(covariates5)[24]<-"congdonIndex"
# NB: county and ZIP names can be taken from the xWalk file and merged to the covariate file if needed (only helpful for Queens), but the merge is difficult because duplicate ZCTAs must be removed first
# zipNames<-xWalk[-1,c("zcta5", "cntyname", "zipname")]
# names(zipNames)[1]<-"zcta"
# merge to covariates file
# covariates6<-merge(covariates5, zipNames, by="zcta", all.x=T, all.y=F)
# save the covariates file
Downloaded updated demographic data from the New York City Open Data portal.
newDat1<-read.csv("~/Demographic_Statistics_By_Zip_Code.csv",header=T, stringsAsFactors=F)
newDat1$JURISDICTION.NAME<-as.character(newDat1$JURISDICTION.NAME)
newDat2<-newDat1[,c("JURISDICTION.NAME","PERCENT.HISPANIC.LATINO","PERCENT.US.CITIZEN","PERCENT.RECEIVES.PUBLIC.ASSISTANCE" )]
names(newDat2)<-c("zcta", "propHisp", "propCitizen", "propPublicAssist2")
covariates6<-merge(covariates5, newDat2, by="zcta", all.x=T, all.y=F)
Downloaded shapefile data on ZCTA-level cardiovascular disease and diabetes prevalence from the SimplyAnalytics app via NYU Libraries.
library(foreign)
# heart disease
shapefile1<-read.dbf("~/SimplyAnalytics_Shapefiles_2020-04-19_15_07_55_9a48bb6f434ea2cdb7c907ec71313405.dbf")
str(shapefile1)
# variable_names.txt
# VALUE0 # Population, 2019
# VALUE1 % MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | HEART DISEASE/CONGESTIVE HEART FAILURE, 2019
names(shapefile1)[1]<-"zcta"
names(shapefile1)[4]<-"heartDisease"
heartDisease<-shapefile1[,c(1,4)]
head(heartDisease)
heartDisease$heartDisease[is.na(heartDisease$heartDisease)]<-0.001
covariates7<-merge(covariates6,heartDisease, by="zcta", all.x=T, all.y=F)
# Other health metrics
# HTN, CAD, Emphysema
shapefile2<-read.dbf("~/SimplyAnalytics_Shapefiles_2020-04-19_15_12_56_576cfa55f705a769f6bfe050f89469ea.dbf")
str(shapefile2)
# VALUE0 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | HYPERTENSION/HIGH BLOOD PRESSURE, 2019
# VALUE1 % MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | HEART DISEASE/CONGESTIVE HEART FAILURE, 2019
# VALUE2 % MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | EMPHYSEMA, 2019
# VALUE3 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | HEART ATTACK/STROKE, 2019
# DM, COPD
shapefile3<-read.dbf("~/SimplyAnalytics_Shapefiles_2020-04-19_15_20_02_0fac73e47dd32eb23d46fa3821630f62.dbf")
str(shapefile3)
# VALUE0 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | DIABETES TYPE 1, 2019
# VALUE1 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | DIABETES TYPE 2, 2019
# VALUE2 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | COPD (CHRONIC OBSTRUCTIVE PULMONARY DIS), 2019
names(shapefile3)[1]<-"zcta"
names(shapefile3)[5]<-"COPD"
COPD<-shapefile3[,c(1,5)]
COPD$COPD[is.na(COPD$COPD)]<-0.001
covariates8<-merge(covariates7,COPD, by="zcta", all.x=T, all.y=F)
saveRDS(covariates8, "/Users/charlie/OneDrive - NYU Langone Health/covidINLA/covidNYCZCTA_covariates.RDS")
NYC DOHMH releases COVID-19 positive-test data on GitHub; files are updated every 2 days or so. Previous files have to be tracked down by clicking on “History”, selecting the date, then clicking the three-dot ellipsis to access the link to the csv for that date.
dat422<-read.csv("~/tests-by-zcta.csv",header=T, stringsAsFactors=F) # 4/22 data
# recentDat<-read.csv("https://github.com/nychealth/coronavirus-data/raw/master/tests-by-zcta.csv",header=T, stringsAsFactors=F)
saveRDS(dat422, "~/covidNYCZCTA_422.RDS")
Merge the covariates data to the most recent total testing data.
covariates<-readRDS("~/covidNYCZCTA_covariates.RDS")
dat1<-readRDS("/Users/charlie/OneDrive - NYU Langone Health/covidINLA/covidNYCZCTA_422.RDS")
names(dat1)[1]<-"zcta"
testDat<-merge(dat1, covariates, by="zcta", all.x=T, all.y=F )
# calculate rate of positive tests per 10,000 population
testDat$rate<-testDat$Positive/testDat$POP2010*10000
# calculate proportion positive per 10,000 tests
testDat$prop<-testDat$Positive/testDat$Total*10000
saveRDS(testDat, "~/covidINLA/testDat.RDS")
Read in spatial file of NYC ZCTAs. Index the map file to the test data file to restrict to ZCTAs with valid data entries.
nyc <- readShapePoly("~/nycZCTA/tl_2010_36_zcta510NYC.shp")
valid<-nyc$ZCTA5CE10 %in% testDat$zcta
nyc<-nyc[valid,]
plot(nyc)
# save as an RDS object
saveRDS(nyc, "~/nycMap4COVID.RDS")
Create an adjacency matrix “nb” object from the map file using poly2nb(), then manually edit the matrix to create adjacencies between boroughs with the edit.nb() utility: interactively click on the map to select two areas to connect, enter “y” to connect them, and enter “c” to continue. Save the edited adjacency object. Plot the default and the edited adjacency matrices.
zzz <- poly2nb(nyc)
nnb1<-edit.nb(zzz, polys=nyc)
# added contiguities between points
# 67 and 74
# 66 and 142
# 25 and 161
# 34 and 137
# 44 and 122
# 13 and 131
# 52 and 83
# 12 and 93
# 4 and 148
# 157 and 175
# 51 and 55
# extract polygon centroid coordinates from the shapefile to plot the adjacency matrix
library(sf)
nyc2<-st_read("~/tl_2010_36_zcta510NYC.shp")
nyc2<-nyc2[valid,]
coords <- st_coordinates(st_centroid(st_geometry(nyc2)))
# plot default adjacency matrix
jpeg('~/adjacencyMatrixPlot1.jpg')
plot(nyc)
plot.nb(zzz, coords, add=T)
dev.off()
# plot edited adjacency matrix
jpeg('~/adjacencyMatrixPlot2.jpg')
plot(nyc)
plot.nb(nnb1, coords, add=T)
dev.off()
# save the nnb object
saveRDS(nnb1, "~/nnb1.RDS")
After correcting the nb object, use nb2INLA() to get it into the correct format for INLA, saving the resulting graph file for later use in models in the covidINLA directory. Set up a path to it, saved as an object named “nyc.adj”.
nb2INLA("/Users/charlie/OneDrive - NYU Langone Health/covidINLA/nyc.graph", nnb1)
nyc.adj <- "/Users/charlie/OneDrive - NYU Langone Health/covidINLA/nyc.graph"
file.show(nyc.adj)
¹ Peter Congdon. Suicide and Parasuicide in London: A Small-area Study. Urban Studies 33(1):137-158, 1996; and Roman Pabayo, Beth E. Molnar, Nancy Street, and Ichiro Kawachi. The relationship between social fragmentation and sleep among adolescents living in Boston, Massachusetts. Journal of Public Health (2014), doi:10.1093/pubmed/fdu001.