library(maptools, quietly = TRUE) # read ESRI shapefiles
library(sp) # spatial data classes
library(spdep, quietly = TRUE) # neighbor/adjacency structures
library(INLA, quietly = TRUE) # approximate Bayesian inference
library(ggplot2)
# library(kableExtra)
library(flextable) # formatted tables
testDat<-readRDS("/Users/charlie/OneDrive - NYU Langone Health/covidINLA/testDat.RDS")
nyc <- readRDS("/Users/charlie/OneDrive - NYU Langone Health/covidINLA/nycMap4COVID.RDS")
nycDat <- attr(nyc, "data")
ord <- match(nycDat$ZCTA5CE10, testDat$zcta) # align testDat rows with the map polygon order
testDat <- testDat[ord,]
testDat$ID.area <- seq(1, nrow(testDat)) # sequential area index for the INLA random effects
nyc.adj <- "/Users/charlie/OneDrive - NYU Langone Health/covidINLA/nyc.graph" # path to the INLA adjacency graph file
library(DescTools)
options(scipen = 10) # don't use scientific notation
options(fmt.num=structure(list(digits=2, big.mark=","), class="fmt")) # force use commas as thousands separator and 2 decimal places
Desc(testDat$rate, main="Positive COVID-19 Tests per 10,000 ZCTA Population")
Desc(testDat$prop, main="Positive COVID-19 Tests per 10,000 ZCTA Tests")
# ZCTAs with the lowest and highest rates and proportions
testDat$zcta[testDat$rate %in% head(sort(testDat$rate))]
testDat$zcta[testDat$rate %in% tail(sort(testDat$rate))]
testDat$zcta[testDat$prop %in% head(sort(testDat$prop))]
testDat$zcta[testDat$prop %in% tail(sort(testDat$prop))]
# table highest and lowest quantiles
cut.rate<-quantile(testDat$rate,probs = seq(0, 1, by = 0.20))
testDat$rate.cut<-cut(testDat$rate,breaks=cut.rate,include.lowest=TRUE)
testDat$rate.tab<-NA
testDat$rate.tab[testDat$rate.cut=="[42,105]"]<-"Low"
testDat$rate.tab[testDat$rate.cut=="(222,323]"]<-"High"
tab1<-data.frame(TOne(testDat[,c("MHI","schDen","POPDEN2010","houseDensity1","congdonIndex","propBlack","propOld","propHisp","heartDisease","COPD")], grp=testDat[,c("rate.tab")], vnames=c("MHI", "School Density", "Population Density", "Housing Density", "Congdon Index", "Black", "Older than 65", "Hispanic", "Heart Disease", "COPD"))[1:10,1:5])
regulartable(data.frame(tab1[1:10,]), cwidth=1.2)
# Choropleths of Spatial Risk
We start with a simple choropleth of positive COVID-19 tests per 10,000 ZCTA population.
cut.rate<-quantile(testDat$rate,probs = seq(0, 1, by = 0.20))
testDat$rate.cut<-cut(testDat$rate,breaks=cut.rate,include.lowest=TRUE)
# summary(testDat$rate.cut)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=rate.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Rate (Quantiles)")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("")+ylab("")+
ggtitle("Positive COVID19 Tests per 10,000 Population,
New York City ZIP Code Tabulation Areas April 3-22, 2019")
print(p4)
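The three fortify/merge/order lines above are repeated verbatim before each map below; a small helper (a sketch only, with the hypothetical name makeMapDF, assuming the nyc map object and the zcta column used above) could stand in for them:
# sketch: build the ggplot-ready data frame used by each choropleth
makeMapDF <- function(map, dat) {
  df <- fortify(map, region = "ZCTA5CE10") # polygons to data frame
  df <- merge(df, dat, by.x = "id", by.y = "zcta", all = FALSE) # attach attributes
  df[order(df$group, df$order), ] # restore polygon drawing order
}
# usage: nyc.df <- makeMapDF(nyc, testDat)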
Next, we map the proportion of positive COVID-19 tests per 10,000 total tests.
cut.prop<-quantile(testDat$prop,probs = seq(0, 1, by = 0.20))
testDat$prop.cut<-cut(testDat$prop,breaks=cut.prop,include.lowest=TRUE)
# summary(testDat$prop.cut)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=prop.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Proportion (Quantiles)", labels= c("2590, 4360", "4360, 5090", "5090, 5640", "5640, 5890", "5890, 7330" ))
p4<- p3 + theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("Latitude (74.2179 W)")+ylab("Longitude (43.2994 N)")+
ggtitle("Positive COVID19 Tests per 10,000 Tests,
New York City ZIP Code Tabulation Areas April 3-22, 2019")+ theme(plot.title = element_text(hjust = 0.5))
print(p4)
ggsave("/Users/charlie/OneDrive\ -\ NYU\ Langone\ Health/covidINLA/covidINLAwriteup/covidINLA_annalsEpi/covidINLA_annalsEpiRev1/covidINLA_annalsRev1FIG2.jpg", p4)
The first (frailty) model illustrates the random effect term, with no explicit spatial component.
formulaUH <- Positive~ f(ID.area, model = "iid", offset(Total)) # specify model
resultUH <- inla(formulaUH,family="poisson", data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultUH)
# exp(resultUH$summary.fixed)
# mean(testDat$Positive, na.rm=T) # model results differ because adjusted for population
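Note that the exposure enters this formula through offset(Total) inside f(); an alternative, arguably more conventional INLA specification (a sketch only; all results reported here use the formula above) supplies the exposure through inla()'s E argument, so the expected counts scale the Poisson mean:
# sketch: frailty model with the ZCTA test totals as the Poisson exposure
formulaUH2 <- Positive ~ f(ID.area, model = "iid")
resultUH2 <- inla(formulaUH2, family = "poisson", E = testDat$Total,
                  data = testDat, control.compute = list(dic = TRUE, cpo = TRUE))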
The line graph plots the density of the random effect term around the mean value, restricting some extreme negative values to better illustrate the (approximately) normal distribution of the random effects.
re1<-resultUH$summary.random$ID.area[,2]
# plot(density(re1)) # plot density random effects
plot(density(re1[re1>-2e-04]), main="Density Plot Random Effects Frailty Model") # restrict
We next map the random effects term to illustrate the distribution of the unstructured heterogeneity or variance. The results (as expected) appear generally as a spatially random distribution of variance above and below the mean value.
# add random effects results to dataframe
testDat$UH<-re1
# create cuts
cuts <-quantile(testDat$UH,probs = seq(0, 1, by = 0.20))
testDat$UH.cut<-cut(testDat$UH,breaks=cuts,include.lowest=TRUE)
#merge dataframes, save to data slot of map
testDat$ZCTA5CE10<-testDat$zcta
attr(nyc,"data") <- merge(nycDat,testDat,by="ZCTA5CE10")
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
library(RColorBrewer)
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=UH.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Random\nEffects")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("")+ylab("")+
ggtitle("Unstructured Heterogeneity (Above and Below Mean), Positive COVID19 Tests,
New York City ZIP Code Tabulation Areas April 3-8, 2019")
print(p4)
The convolution model adds a spatially-structured conditional autoregression term to the spatially-unstructured heterogeneity random effect term of the frailty model.
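In model form, for ZCTA \(i\), \(y_i \sim \text{Poisson}(\mu_i)\) with \(\log(\mu_i) = \alpha + \upsilon_i + \nu_i\), where \(\upsilon_i\) is the unstructured (iid) heterogeneity and \(\nu_i\) the spatially structured (CAR) component; this matches the notation \(\theta = \alpha + \upsilon + \nu\) used for the fitted values below.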
formulaCAR <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)
resultCAR <- inla(formulaCAR,family="poisson", data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCAR)
# exp(resultCAR$summary.fixed)
re<-resultCAR$summary.random$ID.area[1:length(testDat$zcta),2] # first n elements: combined (unstructured + spatial) effects
# plot(density(re[re>-0.002]), main="Density Plot Random Effects Convolution Model") # plot density random effects
car<-resultCAR$summary.random$ID.area[(length(testDat$zcta)+1):(2*length(testDat$zcta)),2] # next n elements: spatially structured component
# plot(density(car[car>-0.0005]),main="Density Plot Spatial Effects Convolution Model") # plot density spatial effects
We next map the spatially structured heterogeneity, showing the variance values above and below zero; this appears to have greater spatial structure than the simple frailty model, with some ZCTAs demonstrating higher variance estimates than others.
# add car results to dataframe
testDat$car<-car
# create cuts
cuts <- quantile(testDat$car,probs = seq(0, 1, by = 0.20))
# summary(cuts)
testDat$car.cut<-cut(testDat$car,breaks=cuts, include.lowest=TRUE)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
# library(RColorBrewer)
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=car.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Spatial\nEffects")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("")+ylab("")+
ggtitle("Spatially Structured (CAR) Heterogeneity,Positive COVID19 Tests,
New York City ZIP Code Tabulation Areas April 3-22, 2019")
print(p4)
Using calculations from the fitted convolution model, we first map the so-called fitted values, which are the sum of the mean value plus the unstructured and spatially structured variance components (\(\theta = \alpha + \upsilon + \nu\)).
#risk (theta = alpha + upsilon + nu)
CARfit <- resultCAR$summary.fitted.values[,1] # posterior mean fitted values
testDat$fit<-CARfit
cut.fit<-quantile(testDat$fit,probs = seq(0, 1, by = 0.20))
testDat$fit.cut<-cut(testDat$fit,breaks=cut.fit,include.lowest=TRUE)
# summary(testDat$fit.cut)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=fit.cut))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Fitted Values")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("")+ylab("")+
ggtitle("Fitted Model Values, Heterogeneity,Positive COVID19 Tests,
New York City ZIP Code Tabulation Areas April 3-22, 2019")
print(p4)
Next, the spatial risk estimate is calculated as the sum of the unstructured and spatially structured variance components (\(\zeta = \upsilon + \nu\)).
#spatial risk (zeta = upsilon + nu)
CARmarginals <- resultCAR$marginals.random$ID.area[1:177] # first 177 marginals: combined random effects (zeta)
CARzeta <- lapply(CARmarginals,function(x)inla.emarginal(exp,x)) # exponentiate to the relative risk scale
testDat$risk<-unlist(CARzeta)
cut.risk<-quantile(testDat$risk, probs = seq(0, 1, by = 0.20))
testDat$risk.cut<-cut(testDat$risk,breaks=cut.risk,include.lowest=TRUE)
testDat$risk.cut2<-cut(testDat$risk,breaks=c(0, 1, 1.001, 1.002, 1.003))
# summary(testDat$risk.cut)
nyc.df <- fortify(nyc, region='ZCTA5CE10')
nyc.df <- merge(nyc.df, testDat, by.x="id", by.y="zcta", all=FALSE)
nyc.df <- nyc.df[order(nyc.df$group, nyc.df$order), ]
p1<-ggplot(nyc.df, aes(x=long, y=lat))
p2<-p1+geom_polygon(aes(group=group, fill=risk.cut2))
p3<- p2 + scale_fill_brewer(palette="Blues", name="Spatial\nRisk")
p4<- p3+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks = element_blank()) +
theme(panel.background = element_rect(colour = NA)) +
xlab("Latitude (74.2179 W)")+ylab("Longitude (43.2994 N)") +
ggtitle("Spatially Structured Risk Variance Estimates, Positive COVID19 Tests,
New York City ZIP Code Tabulation Areas April 3-22, 2019")+ theme(plot.title = element_text(hjust = 0.5))
print(p4)
ggsave("/Users/charlie/OneDrive\ -\ NYU\ Langone\ Health/covidINLA/covidINLAwriteup/covidINLA_annalsEpi/covidINLA_annalsEpiRev1/covidINLA_annalsRev1FIG3.jpg", p4)
Finally, we estimate the proportion of the variance explained by geographic variation, or place, which for this model is approximately 32%.
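The estimate is the ratio \(\sigma^2_{\nu} / (\sigma^2_{\nu} + \sigma^2_{\upsilon})\), i.e., the spatially structured share of the total random-effect variance; the code below approximates \(\sigma^2_{\nu}\) by simulating from the marginals of the spatial component and \(\sigma^2_{\upsilon}\) from the posterior of the iid precision.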
mat.marg <- matrix(NA, nrow=177, ncol=1000)
m<-resultCAR$marginals.random$ID.area
for (i in 1:177){
u<-m[[177+i]] # marginals 178-354: spatially structured component
s<-inla.rmarginal(1000, u) # 1000 draws from each marginal
mat.marg[i,]<-s}
var.RRspatial<-mean(apply(mat.marg, 2, sd))^2 # empirical variance of the spatial component
var.RRhet<-inla.emarginal(function(x) 1/x,
resultCAR$marginals.hyper$"Precision for ID.area (iid component)") # variance of the iid component
# var.RRspatial/(var.RRspatial+var.RRhet) # 0.3200823
The following are exponentiated coefficient results for unadjusted single-covariate models of associations with positive COVID-19 test counts.
formulaCOV1 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(POPDEN2010)
resultCOV1 <- inla(formulaCOV1,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV1)
# resultCOV1$dic
# exp(resultCOV1$summary.fixed)
formulaCOV2 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(MHI)
resultCOV2 <- inla(formulaCOV2,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV2)
# resultCOV2$dic
# exp(resultCOV2$summary.fixed)
formulaCOV3 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(schDen)
resultCOV3 <- inla(formulaCOV3,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV3)
# resultCOV3$dic
# exp(resultCOV3$summary.fixed)
formulaCOV4 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(propOld)
resultCOV4 <- inla(formulaCOV4,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV4)
# resultCOV4$dic
# exp(resultCOV4$summary.fixed)
formulaCOV5 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(propAsian)
resultCOV5 <- inla(formulaCOV5,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV5)
# resultCOV5$dic
# exp(resultCOV5$summary.fixed)
formulaCOV6 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(houseDensity1)
resultCOV6 <- inla(formulaCOV6,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV6)
# resultCOV6$dic
# exp(resultCOV6$summary.fixed)
formulaCOV7 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ congdonIndex
resultCOV7 <- inla(formulaCOV7,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV7)
# resultCOV7$dic
# exp(resultCOV7$summary.fixed)
formulaCOV8 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(nonEnglish)
resultCOV8 <- inla(formulaCOV8,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV8)
# resultCOV8$dic
# exp(resultCOV8$summary.fixed)
formulaCOV9 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(propBlack)
resultCOV9 <- inla(formulaCOV9,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV9)
# resultCOV9$dic
# exp(resultCOV9$summary.fixed)
formulaCOV10 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(propPublicAssist2)
resultCOV10 <- inla(formulaCOV10,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV10)
# resultCOV10$dic
# exp(resultCOV10$summary.fixed)
formulaCOV11 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(heartDisease)
resultCOV11 <- inla(formulaCOV11,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV11)
# resultCOV11$dic
# exp(resultCOV11$summary.fixed)
formulaCOV12 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(COPD)
resultCOV12 <- inla(formulaCOV12,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# summary(resultCOV12)
# resultCOV12$dic
# exp(resultCOV12$summary.fixed)
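The twelve single-covariate fits above differ only in the covariate term. A more compact equivalent (a sketch only, using the hypothetical names covars and univarFits, assuming the testDat and nyc.adj objects above, and scaling every covariate, whereas the original leaves congdonIndex unscaled) loops over the variable names:
# sketch: fit all single-covariate convolution models in one pass
covars <- c("POPDEN2010", "MHI", "schDen", "propOld", "propAsian",
            "houseDensity1", "congdonIndex", "nonEnglish", "propBlack",
            "propPublicAssist2", "heartDisease", "COPD")
univarFits <- lapply(covars, function(v) {
  f <- as.formula(paste0(
    "Positive ~ f(ID.area, model='bym', offset(Total), ",
    "graph=nyc.adj, adjust.for.con.comp=FALSE) + scale(", v, ")"))
  inla(f, family = "poisson", data = testDat,
       control.compute = list(dic = TRUE, cpo = TRUE))
})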
The results of the single-covariate models of associations with positive COVID-19 test counts are summarized below.
summaryUnivariate<-data.frame(rbind(
round(exp(resultCOV1$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV2$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV3$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV4$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV5$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV6$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV7$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV8$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV9$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV10$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV11$summary.fixed[2,c(1,3,5)]),1),
round(exp(resultCOV12$summary.fixed[2,c(1,3,5)]),1)
))
names(summaryUnivariate)<-c("IDR", "2.5%", "97.5%")
summaryUnivariate$Model<-c("Population Density", "Median Household Income", "School Density", "Older than 65", "Asian", "Housing Density", "Congdon Index", "Non-English Language", "Black", "Public Assistance", "Heart Disease", "COPD")
summaryUnivariate<-summaryUnivariate[,c(4,1,2,3)]
regulartable(summaryUnivariate)
formulaMultivar1 <- Positive~ f(ID.area, model="bym", offset(Total), graph=nyc.adj, adjust.for.con.comp = FALSE)+ scale(COPD) + scale(heartDisease) +scale(propBlack)+scale(propOld)+scale(houseDensity1)
resultMultivar1 <- inla(formulaMultivar1,family="poisson",data=testDat, control.compute=list(dic=TRUE,cpo=TRUE))
# stargazer(resultMultivar1)
# summary(resultMultivar1)
# resultMultivar1$dic
# exp(resultMultivar1$summary.fixed)
multiVarMod<-data.frame(round(exp(resultMultivar1$summary.fixed)[,c(1,3,5)],2))
names(multiVarMod)<-c("IDR", "2.5%", "97.5%")
multiVarMod$Variable<-c("Intercept","COPD", "Heart Disease", "Black", "Older than 65", "Housing Density" )
multiVarMod<-multiVarMod[,c(4,1,2,3)]
rownames(multiVarMod)<-1:nrow(multiVarMod)
regulartable(multiVarMod)
Covariate data at the ZCTA level were obtained from the US Census and from the Institute for Social Research.
covariates<-read.csv("~/zcta_varsFIN.csv",header=T, stringsAsFactors=F)
str(covariates)
# select variables
# schDen (School Density (schools per square kilometer))
# POP2010 2010 Population: Total
# AG65UP 2010 Population: 65 years and older
# BLK 2010 Population: Black or African American alone
# ASN 2010 Population: Asian alone
# POPDEN2010 2010 Population Density (persons per square kilometer)
# HOUS2000 2000 Public Assistance Income in 1999 for Households - Total
# POPLNGOTH population older than 5 speaking other than English (to be used with POPTOT5 (population older than 5) to calculate proportion )
covariates2<-covariates[,c("zcta10","schDen","POP2010","AG65UP","BLK","ASN","POPDEN2010","HOUS2000", "POPTOT5", "POPLNGOTH")]
covariates2$zcta<-substr(covariates2$zcta10,3,7) # extract the 5-digit ZCTA from the zcta10 code
covariates2$nonEnglish<-covariates2$POPLNGOTH/covariates2$POPTOT5
covariates2$propBlack<-covariates2$BLK/covariates2$POP2010
covariates2$propOld<-covariates2$AG65UP/covariates2$POP2010
covariates2$propAsian<-covariates2$ASN/covariates2$POP2010
covariates2$propAssistance<-covariates2$HOUS2000/covariates2$POP2010
MHI<-read.csv("~/zctaMHI.csv",header=T, stringsAsFactors=F)
names(MHI)[1]<-"zcta"
MHI$zcta<-as.character(MHI$zcta)
MHI<-MHI[,-3] # remove POP variable
covariates3<-merge(MHI, covariates2, by="zcta", all.x=F, all.y=T)
Housing density and area in square miles at the ZCTA level can be created with the code below, taken from “spatialEpiModels.Rnw”. We create two housing density measures: houseDensity1 is the simple rate of the number of houses per square mile; houseDensity2 is this rate divided by the total number of people in a ZCTA.
congdonZCTA<-read.table(file="~/censusPopHousing.txt", header=F, sep=",", skip=2, col.names=c("x1", "zcta", "x2", "x3","x4", "house.tot", "pop.tot", "house.rent", "house.occ"), colClasses=c("character", "character", "character", "character","character", "numeric", "numeric","numeric", "numeric"))
congdonZCTA<-congdonZCTA[,-c(1,3:5)]
congdonZCTA$zcta<-as.numeric(congdonZCTA$zcta)
complete<-complete.cases(congdonZCTA)
congdonZCTA<-congdonZCTA[complete,]
area<-read.table(file="~/censusSqMiles.txt", header=T)
area<-area[,-c(2,3,5:7)]
str(area)
# merge
congdonZCTA2<-merge(congdonZCTA, area, by="zcta", all.x=T, all.y=F)
congdonZCTA2<-congdonZCTA2[,-3] # remove pop.tot variable
congdonZCTA2$houseDensity1<-congdonZCTA2$house.tot/congdonZCTA2$sq.miles
# merge to covariate data
covariates4<-merge(covariates3, congdonZCTA2, by="zcta", all.x=T, all.y=F)
covariates4$houseDensity2<-covariates4$houseDensity1/covariates4$POP2010
Create a variable based on Peter Congdon’s social fragmentation index.¹ It combines four variables extracted from US Census data: the proportion of total housing units that are not owner occupied, the proportion of vacant housing units, the proportion of individuals living alone, and the proportion of housing units into which someone recently moved. Based on Census definitions, a “recent” move is defined as any time in the previous 9 years (since the last decennial census). As in the Congdon paper, the variables are standardized and summed with equal weight.
# get variables for Congdon Index at Census Tract level
library(UScensus2010tract)
data(new_york.tract10)
# extract housing variables for the social fragmentation index from
# the UScensus2010tract spatial polygon file
congdonZCTA<-attr(new_york.tract10, "data")
congdonZCTA<-subset(congdonZCTA, select=c(fips,H0010001,H0180002,H0180019,H0210001))
names(congdonZCTA)<-c("tract", "totHousing", "ownerHousing",
"aloneHousing", "vacantHousing")
# load pedDat from SRTS files
load(file="~/srtsINLA.RData")
congdonZCTA<-congdonZCTA[congdonZCTA$tract %in% pedDat$tract,]
# extract recent movers from downloaded US census file
recent<-read.csv("~/housingNYStracts.csv",header=T, stringsAsFactors=F)
recent<-subset(recent, select=c(GEO.id2, HC01_VC72))
names(recent)<-c("tract", "recentMove")
# merge congdonZCTA and recent movers files
congdonZCTA<-merge(congdonZCTA, recent, by="tract", all.x=T)
congdonZCTA$recentMove<-as.numeric(congdonZCTA$recentMove)
# set missing recent-mover data to zero
congdonZCTA$recentMove[is.na(congdonZCTA$recentMove)]<-0
# load crosswalk file of NYC tract to ZCTA level
xWalk<-read.csv("~/nycTract2ZCTA.csv",header=T, stringsAsFactors=F)
# format the tract variables for merging
# the tract variable is an 11-character string: first 2 digits state, next 3 county, final 6 census tract, e.g. "36005000100" "36005000200"
congdonZCTA$tract2<-substr(congdonZCTA$tract,6,11)
# xWalk variable is 7 characters with a "." separating the last 2 digits, which needs to be removed
xWalk$tract2<-gsub("[.]","",xWalk$tract)
# merge congdonZCTA and xWalk files
congdonZCTA1<-merge(congdonZCTA, xWalk, by="tract2", all.x=T, all.y=F)
# sum up the Congdon elements by ZCTA
congdonZCTA2<-aggregate(. ~ congdonZCTA1$zcta5, data=congdonZCTA1[,3:7], FUN=sum)
names(congdonZCTA2)[1]<-"zcta"
# CONGDON CALCULATIONS
# calculate proportions based on total housing units
# avoid division by zero
congdonZCTA2$owner<-congdonZCTA2$ownerHousing/(congdonZCTA2$totHousing+.1)
congdonZCTA2$alone<-congdonZCTA2$aloneHousing/(congdonZCTA2$totHousing+.1)
congdonZCTA2$vacant<-congdonZCTA2$vacantHousing/(congdonZCTA2$totHousing+.1)
congdonZCTA2$notOwner<-1-congdonZCTA2$owner
congdonZCTA2$recent<-congdonZCTA2$recentMove/(congdonZCTA2$totHousing+.1)
# standardize proportions
congdonZCTA2$aloneST<-
(congdonZCTA2$alone-mean(congdonZCTA2$alone, na.rm=T))/sd(congdonZCTA2$alone, na.rm=T)
summary(congdonZCTA2$aloneST)
congdonZCTA2$vacantST<-
(congdonZCTA2$vacant-mean(congdonZCTA2$vacant, na.rm=T))/sd(congdonZCTA2$vacant, na.rm=T)
summary(congdonZCTA2$vacantST)
congdonZCTA2$notOwnerST<-
(congdonZCTA2$notOwner-mean(congdonZCTA2$notOwner, na.rm=T))/sd(congdonZCTA2$notOwner, na.rm=T)
summary(congdonZCTA2$notOwnerST)
congdonZCTA2$recentST<-
(congdonZCTA2$recent-mean(congdonZCTA2$recent, na.rm=T))/sd(congdonZCTA2$recent, na.rm=T)
summary(congdonZCTA2$recentST)
# add standardized measures into single index
congdonZCTA2$index<-congdonZCTA2$aloneST+congdonZCTA2$vacantST+congdonZCTA2$notOwnerST+congdonZCTA2$recentST
summary(congdonZCTA2$index)
plot(density(congdonZCTA2$index))
quantile(congdonZCTA2$index, probs=c(0.05,0.95))
# 5% 95%
# -3.056998 3.944542
# save the ZCTA-level Congdon Index file
saveRDS(congdonZCTA2, "~/nycZCTACongdon.RDS")
congdonZCTA2<-readRDS("~/nycZCTACongdon.RDS")
# merge covariates4 and congdon
# restrict congdon file to just index and zcta
congdon<-congdonZCTA2[,c("zcta","index")]
covariates5<-merge(covariates4, congdon, by="zcta", all.x=T, all.y=F)
str(covariates5)
names(covariates5)[24]<-"congdonIndex"
# NB: county and ZIP names can be taken from the xWalk file and merged to the covariate file if needed (only helpful for Queens), but the merge is difficult because duplicate ZCTAs must be removed first
# zipNames<-xWalk[-1,c("zcta5", "cntyname", "zipname")]
# names(zipNames)[1]<-"zcta"
# merge to covariates file
# covariates6<-merge(covariates5, zipNames, by="zcta", all.x=T, all.y=F)
# save the covariates file
Downloaded updated demographic data from the New York City Open Data portal.
newDat1<-read.csv("~/Demographic_Statistics_By_Zip_Code.csv",header=T, stringsAsFactors=F)
newDat1$JURISDICTION.NAME<-as.character(newDat1$JURISDICTION.NAME)
newDat2<-newDat1[,c("JURISDICTION.NAME","PERCENT.HISPANIC.LATINO","PERCENT.US.CITIZEN","PERCENT.RECEIVES.PUBLIC.ASSISTANCE" )]
names(newDat2)<-c("zcta", "propHisp", "propCitizen", "propPublicAssist2")
covariates6<-merge(covariates5, newDat2, by="zcta", all.x=T, all.y=F)
Downloaded shapefile data on ZCTA-level cardiovascular disease and diabetes prevalence from the SimplyAnalytics app via NYU Libraries.
library(foreign)
# heart disease
shapefile1<-read.dbf("~/SimplyAnalytics_Shapefiles_2020-04-19_15_07_55_9a48bb6f434ea2cdb7c907ec71313405.dbf")
str(shapefile1)
# variable_names.txt
# VALUE0 # Population, 2019
# VALUE1 % MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | HEART DISEASE/CONGESTIVE HEART FAILURE, 2019
names(shapefile1)[1]<-"zcta"
names(shapefile1)[4]<-"heartDisease"
heartDisease<-shapefile1[,c(1,4)]
head(heartDisease)
heartDisease$heartDisease[is.na(heartDisease$heartDisease)]<-0.001
covariates7<-merge(covariates6,heartDisease, by="zcta", all.x=T, all.y=F)
# Other health metrics
# HTN, CAD, Emphysema
shapefile2<-read.dbf("~/SimplyAnalytics_Shapefiles_2020-04-19_15_12_56_576cfa55f705a769f6bfe050f89469ea.dbf")
str(shapefile2)
# VALUE0 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | HYPERTENSION/HIGH BLOOD PRESSURE, 2019
# VALUE1 % MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | HEART DISEASE/CONGESTIVE HEART FAILURE, 2019
# VALUE2 % MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | EMPHYSEMA, 2019
# VALUE3 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | HEART ATTACK/STROKE, 2019
# DM, COPD
shapefile3<-read.dbf("~/SimplyAnalytics_Shapefiles_2020-04-19_15_20_02_0fac73e47dd32eb23d46fa3821630f62.dbf")
str(shapefile3)
# VALUE0 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | DIABETES TYPE 1, 2019
# VALUE1 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | DIABETES TYPE 2, 2019
# VALUE2 # MEDICINE/DRUGS/AILMENTS | AILMENTS | AILMENTS HAVE/HAD IN LAST 12 MONTHS? | COPD (CHRONIC OBSTRUCTIVE PULMONARY DIS), 2019
names(shapefile3)[1]<-"zcta"
names(shapefile3)[5]<-"COPD"
COPD<-shapefile3[,c(1,5)]
COPD$COPD[is.na(COPD$COPD)]<-0.001
covariates8<-merge(covariates7,COPD, by="zcta", all.x=T, all.y=F)
saveRDS(covariates8, "/Users/charlie/OneDrive - NYU Langone Health/covidINLA/covidNYCZCTA_covariates.RDS")
NYC DOHMH releases COVID-19 positive-test data on GitHub; files are updated every 2 days or so. Previous files have to be tracked down by clicking on “History”, selecting the date, then clicking the three-dot ellipsis to access the link to the csv for that date.
dat422<-read.csv("~/tests-by-zcta.csv",header=T, stringsAsFactors=F) # 4/22 data
# recentDat<-read.csv("https://github.com/nychealth/coronavirus-data/raw/master/tests-by-zcta.csv",header=T, stringsAsFactors=F)
saveRDS(dat422, "~/covidNYCZCTA_422.RDS")
Merge the covariates data to the most recent total testing data.
covariates<-readRDS("~/covidNYCZCTA_covariates.RDS")
dat1<-readRDS("/Users/charlie/OneDrive - NYU Langone Health/covidINLA/covidNYCZCTA_422.RDS")
names(dat1)[1]<-"zcta"
testDat<-merge(dat1, covariates, by="zcta", all.x=T, all.y=F )
# calculate rate of positive tests per 10,000 population
testDat$rate<-testDat$Positive/testDat$POP2010*10000
# calculate proportion positive per 10,000 tests
testDat$prop<-testDat$Positive/testDat$Total*10000
saveRDS(testDat, "~/covidINLA/testDat.RDS")
Read in spatial file of NYC ZCTAs. Index the map file to the test data file to restrict to ZCTAs with valid data entries.
nyc <- readShapePoly("~/nycZCTA/tl_2010_36_zcta510NYC.shp")
valid<-nyc$ZCTA5CE10 %in% testDat$zcta
nyc<-nyc[valid,]
plot(nyc)
# save as an RDS object
saveRDS(nyc, "~/nycMap4COVID.RDS")
Create an adjacency matrix “nb” object from the map file using poly2nb(), then manually edit the matrix to create adjacencies between boroughs with the edit.nb() utility: interactively click on the map to select two areas to connect, enter “y” to connect them, and enter “c” to continue. Save the edited adjacency object. Plot the default and the edited adjacency matrices.
zzz <- poly2nb(nyc)
nnb1<-edit.nb(zzz, polys=nyc)
# added contiguities between points
# 67 and 74
# 66 and 142
# 25 and 161
# 34 and 137
# 44 and 122
# 13 and 131
# 52 and 83
# 12 and 93
# 4 and 148
# 157 and 175
# 51 and 55
# extract polygon centroid coordinates from the shapefile to plot the adjacency matrix
library(sf)
nyc2<-st_read("~/tl_2010_36_zcta510NYC.shp")
nyc2<-nyc2[valid,]
coords <- st_coordinates(st_centroid(st_geometry(nyc2)))
# plot default adjacency matrix
jpeg('~/adjacencyMatrixPlot1.jpg')
plot(nyc)
plot.nb(zzz, coords, add=T)
dev.off()
# plot edited adjacency matrix
jpeg('~/adjacencyMatrixPlot2.jpg')
plot(nyc)
plot.nb(nnb1, coords, add=T)
dev.off()
# save the nnb object
saveRDS(nnb1, "~/nnb1.RDS")
After correcting the nb object, use nb2INLA() to get it into the correct format for INLA, saving the resulting graph file for later use in models in the covidINLA directory. Set up a path to it, saved as an object named “nyc.adj”.
nb2INLA("/Users/charlie/OneDrive - NYU Langone Health/covidINLA/nyc.graph", nnb1)
nyc.adj <- "/Users/charlie/OneDrive - NYU Langone Health/covidINLA/nyc.graph"
file.show(nyc.adj)
¹ Peter Congdon. Suicide and Parasuicide in London: A Small-area Study. Urban Studies 33(1):137-158, 1996; and Roman Pabayo, Beth E. Molnar, Nancy Street, and Ichiro Kawachi. The relationship between social fragmentation and sleep among adolescents living in Boston, Massachusetts. Journal of Public Health (2014), doi:10.1093/pubmed/fdu001.