dtas_analysis/dtas_analysis.R

# Some opening notes for the stray reader of this code
# This is pretty messy stuff, pulled together in some haste.
# I'm aware that there are tons of places where repeated code
# should be set up as libraries - feel free to put in a
# pull request to this effect!
#
# I'm also aware that my method of creating a table of quantiles
# is massively inefficient. I'm sure there's a brisk way to
# do this, but couldn't find anything workable.
# As always, code review much appreciated and comments welcome!

# load libraries needed for operations
require(sp) # required for rgdal
require(rgdal) # required for readOGR
require(GISTools) # required for poly.counts
require(maptools) # required for spTransform
require(xtable) # required for xtables done below
require(RColorBrewer) # required for pretty colours
require(RCurl) # required for dataset downloads using curl

# Preliminaries------------------------------------------------
# Define CRS for here and later--------------------------------
wgs84 = '+proj=longlat +datum=WGS84'
bng = "+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000
+y_0=-100000 +datum=OSGB36 +units=m +no_defs +ellps=airy
+towgs84=446.448,-125.157,542.060,0.1502,0.2470,0.8421,-20.4894"
setwd("~/Dropbox/writing_articles_chapters/201805_dtas")

# Create data directory if needed:
if (dir.exists("data") == FALSE) {
  dir.create("data")
}

if (dir.exists("derivedData") == FALSE) {
  dir.create("derivedData")
}

# 1. Load in datasets

# read in polygon for UK admin boundaries----------------------
download.file("https://census.edina.ac.uk/ukborders/easy_download/prebuilt/shape/Scotland_laulevel1_2011.zip", destfile = "data/Scotland_laulevel1_2011.zip")
unzip("data/Scotland_laulevel1_2011.zip", exdir = "data")
admin <- readOGR("./data", "scotland_laulevel1_2011")
# transform CRS to wgs84 for comparisons below
admin.wgs <- spTransform(admin, CRS("+init=epsg:4326"))

# read in relevant polygons, Scottish Index of Multiple deprivation (already WGS84)
download.file("http://sedsh127.sedsh.gov.uk/Atom_data/ScotGov/ZippedShapefiles/SG_SIMD_2016.zip", destfile = "data/SG_SIMD_2016.zip")
unzip("data/SG_SIMD_2016.zip", exdir = "data")
simd <- readOGR("data", "SG_SIMD_2016")


# Load dtas dataset------------------------------------
# dtas <- read.csv(text=getURL("https://zenodo.org/record/198703/files/dtas_3.2.csv"))
# note: need to upload version 4 to zenodo repository!
dtas <- read.csv("data/dtas_4.0.csv")
coordinates(dtas) <- c("X", "Y")
proj4string(dtas) <- proj4string(simd)
# transform CRS to wgs84 for comparisons below
dtas.wgs <- spTransform(dtas, CRS("+init=epsg:4326"))

# note: need to upload data set to zenodo repository! Note: tbd; simplify to third decimal point for accurate representation of coordinate precision
# Load GWSF dataset ----------------------
gwsf <- readOGR("data", "gwsf_groups_gb_sct")
# transform CRS on admin (from BNG) to wgs84 for comparisons below
gwsf.wgs <- spTransform(gwsf, CRS("+init=epsg:4326"))

# Part I - Generate tables and plots based on SIMD ranks

# Subset SIMD for bottom 5/10/15% by overall rank
# Note: using hardcoded numbers for 5/10/15% figures. This should probably be calculated using a length() routine
simd.bottom15r <- simd[ which(simd$rank < 1047),]
simd.bottom10r <- simd[ which(simd$rank < 698),]
simd.bottom5r <- simd[ which(simd$rank < 349),]

# Subset SIMD for bottom 5/10/15% by income domain rank
simd.bottom15incr <- simd[ which(simd$incrank < 1047),]
simd.bottom10incr <- simd[ which(simd$incrank < 698),]
simd.bottom5incr <- simd[ which(simd$incrank < 349),]

# Subset SIMD for bottom 5/10/15% by income domain rank
simd.bottom15empr <- simd[ which(simd$emprank < 1047),]
simd.bottom10empr <- simd[ which(simd$emprank < 698),]
simd.bottom5empr <- simd[ which(simd$emprank < 349),]

# Now generate subsetted SPDF which only includes dtas points which are
# located in remaining (subsetted) SIMD polygons
dtas.simd.bottom15r <- dtas[!is.na(over(dtas, geometry(simd.bottom15r))),]
dtas.simd.bottom10r <- dtas[!is.na(over(dtas, geometry(simd.bottom10r))),]
dtas.simd.bottom5r <- dtas[!is.na(over(dtas, geometry(simd.bottom5r))),]
gwsf.simd.bottom15r <- gwsf[!is.na(over(gwsf, geometry(simd.bottom15r))),]


# Add data from relevant SIMD polygons for each row in dtas DF
dtas.simd.bottom15r@data <- cbind(dtas.simd.bottom15r@data,over(dtas.simd.bottom15r,simd.bottom15r))
dtas.simd.bottom10r@data <- cbind(dtas.simd.bottom10r@data,over(dtas.simd.bottom10r,simd.bottom10r))
dtas.simd.bottom5r@data <- cbind(dtas.simd.bottom5r@data,over(dtas.simd.bottom5r,simd.bottom5r))

gwsf.simd.bottom15r@data <- cbind(gwsf.simd.bottom15r@data,over(gwsf.simd.bottom15r,simd.bottom15r))

# Convert to dataframe, strip out unnecessary columns, and sort by rank
dtas.simd.bottom15r.tidy <- data.frame("name"=dtas.simd.bottom15r$name,"rank"=dtas.simd.bottom15r$rank)
dtas.simd.bottom15r.tidy <- dtas.simd.bottom15r.tidy[with(dtas.simd.bottom15r.tidy, order(rank)), ]
dtas.simd.bottom10r.tidy <- data.frame("name"=dtas.simd.bottom10r$name,"rank"=dtas.simd.bottom10r$rank)
dtas.simd.bottom10r.tidy <- dtas.simd.bottom10r.tidy[with(dtas.simd.bottom10r.tidy, order(rank)), ]
dtas.simd.bottom5r.tidy <- data.frame("name"=dtas.simd.bottom5r$name,"rank"=dtas.simd.bottom5r$rank)
dtas.simd.bottom5r.tidy <- dtas.simd.bottom5r.tidy[with(dtas.simd.bottom5r.tidy, order(rank)), ]

gwsf.simd.bottom15r.tidy <- data.frame("name"=gwsf.simd.bottom15r$name,"rank"=gwsf.simd.bottom15r$rank)
gwsf.simd.bottom15r.tidy <- gwsf.simd.bottom15r.tidy[with(gwsf.simd.bottom15r.tidy, order(rank)), ]


# Export csv tables for use in other applications
write.csv(dtas.simd.bottom15r, "derivedData/dtas.simd.bottom15r.csv", row.names=FALSE)
write.csv(gwsf.simd.bottom15r, "derivedData/gwsf.simd.bottom15r.csv", row.names=FALSE)


# Generate table for DTAS using xtable for bottom 15% simd rank
xt <- xtable(dtas.simd.bottom15r.tidy, caption = "table of DTAS groups in bottom 15%")

print(xt,
      type = "html",
      tabular.environment = "longtable",
      sanitize.colnames.function = function(x){x},
      include.rownames = FALSE,
      floating = FALSE)

# Generate table for GWSF using xtable for bottom 15% simd rank

xt <- xtable(gwsf.simd.bottom15r.tidy, caption = "table of GWSF groups in bottom 15%")

print(xt,
      type = "html",
      tabular.environment = "longtable",
      sanitize.colnames.function = function(x){x},
      include.rownames = FALSE,
      floating = FALSE)

# End Part I

# Part II - Calculate coincidence of other features

# Load in POI table

# Sadly, this dataset is paywalled by digimap, though most academic researchers should be able to aquire it through
# an institutional subscription from Digimap, Ordnance Survey Data Download. The POI set is located under
# "Boundary and Location Data" under "Points of Interest," "select visible area" for whole UK then download.
#
# I'm going to take some shortcuts here, preventing this code from being truly reproducible
# because the poi dataset is really huge - I've already downloaded and filtered to specific codes for features
# so code immediately below is untested. Will need to adjust below to the actual code I'm working with eventually.

## Skip ==>
# poi <- read.csv("data/poi.csv", header = FALSE, sep = "|")
# coordinates(poi) <- c("Feature Easting", "Feature Northing")
# proj4string(poi) <- proj4string(CRS(bng))
## transform CRS on churches (from BNG) to wgs84 for comparisons below
# poi.wgs <- spTransform(poi, CRS(wgs84))

## subsets
#poi.pubs <- poi[!is.na(poi[3] = "01020034")]
#poi.chequecashing <- poi[!is.na(poi[3] = "02090142")]
#poi.pawnbrokers <- poi[!is.na(poi[3] = "02090151")]
#poi.worship <- poi[!is.na(poi[3] = "06340459")]
# ==> to here for now

# Load in retail data from geolytics dataset
# from here: https://geolytix.co.uk/?retail_points
poi.grocery <- read.csv("data/retailpoints_version11_dec17.txt", sep = "\t")
# select useful columns
poi.grocery <- subset(poi.grocery, select = c("retailer", "store_name", "long_wgs", "lat_wgs"))
# convert to spdf
coordinates(poi.grocery) <- c("long_wgs", "lat_wgs")
proj4string(poi.grocery) <- CRS("+init=epsg:4326")
# filter out non-Scottish data
poi.grocery <- poi.grocery[!is.na(over(poi.grocery, geometry(admin.wgs))),]

# Load in British pubs from Ordnance survey dataset
poi.pubs <- read.csv("data/poi_pubs.csv", header = FALSE, sep = "|")
# select useful columns
poi.pubs <- subset(poi.pubs, select = c("V1", "V2", "V3", "V4", "V5"))
# rename columns to tidier names
colnames(poi.pubs) <- c("refnum", "name", "code", "x", "y")
coordinates(poi.pubs) <- c("x", "y")
proj4string(poi.pubs) <- CRS("+init=epsg:27700")
# transform CRS on pubs (from BNG) to wgs84 for comparisons below
poi.pubs.wgs <- spTransform(poi.pubs, CRS("+init=epsg:4326"))
# filter out non-Scottish pubs
poi.pubs.wgs <- poi.pubs.wgs[!is.na(over(poi.pubs.wgs, geometry(admin.wgs))),]

# Load in check cashing from Ordnance survey dataset
poi.checkcashing <- read.csv("data/poi_chequecashing.csv", header = FALSE, sep = "|")
# select useful columns
poi.checkcashing <- subset(poi.checkcashing, select = c("V1", "V2", "V3", "V4", "V5"))
# rename columns to tidier names
colnames(poi.checkcashing) <- c("refnum", "name", "code", "x", "y")
coordinates(poi.checkcashing) <- c("x", "y")
proj4string(poi.checkcashing) <- CRS("+init=epsg:27700")
# transform CRS on pubs (from BNG) to wgs84 for comparisons below
poi.checkcashing.wgs <- spTransform(poi.checkcashing, CRS("+init=epsg:4326"))
# filter out non-Scottish (hah!) check cashing shops
poi.checkcashing.wgs <- poi.checkcashing.wgs[!is.na(over(poi.checkcashing.wgs, geometry(admin.wgs))),]

# Load in pawnbrokers from Ordnance survey dataset
poi.pawnbrokers <- read.csv("data/poi_pawnbrokers.csv", header = FALSE, sep = "|")
# select useful columns
poi.pawnbrokers <- subset(poi.pawnbrokers, select = c("V1", "V2", "V3", "V4", "V5"))
# rename columns to tidier names
colnames(poi.pawnbrokers) <- c("refnum", "name", "code", "x", "y")
coordinates(poi.pawnbrokers) <- c("x", "y")
proj4string(poi.pawnbrokers) <- CRS("+init=epsg:27700")
# transform CRS on pawnbrokers (from BNG) to wgs84 for comparisons below
poi.pawnbrokers.wgs <- spTransform(poi.pawnbrokers, CRS("+init=epsg:4326"))
# filter out non-Scottish (hah!) pawnbrokers
poi.pawnbrokers.wgs <- poi.pawnbrokers.wgs[!is.na(over(poi.pawnbrokers.wgs, geometry(admin.wgs))),]

# Load in places of worship from Ordnance survey dataset
poi.pow <- read.csv("data/poi_pow.csv", header = FALSE, sep = "|")
# select useful columns
poi.pow <- subset(poi.pow, select = c("V1", "V2", "V3", "V4", "V5"))
# rename columns to tidier names
colnames(poi.pow) <- c("refnum", "name", "code", "x", "y")
coordinates(poi.pow) <- c("x", "y")
proj4string(poi.pow) <- CRS("+init=epsg:27700")
# transform CRS on pubs (from BNG) to wgs84 for comparisons below
poi.pow.wgs <- spTransform(poi.pow, CRS("+init=epsg:4326"))
# filter out non-Scottish (hah!) check cashing shops
poi.pow.wgs <- poi.pow.wgs[!is.na(over(poi.pow.wgs, geometry(admin.wgs))),]


# create WGS versions of SIMD subsets from above for comparisons below
simd.bottom15r.wgs <- spTransform(simd.bottom15r, CRS("+init=epsg:4326"))
simd.bottom10r.wgs <- spTransform(simd.bottom10r, CRS("+init=epsg:4326"))
simd.bottom5r.wgs <- spTransform(simd.bottom5r, CRS("+init=epsg:4326"))


# Test it out with a quick plot
plot(poi.pubs.wgs)

# run further subsetting, as above, for specific bands of simd
grocery.simd.bottom15r.wgs <- poi.grocery[!is.na(over(poi.grocery, geometry(simd.bottom15r.wgs))),]
grocery.simd.bottom10r.wgs <- poi.grocery[!is.na(over(poi.grocery, geometry(simd.bottom10r.wgs))),]
grocery.simd.bottom5r.wgs <- poi.grocery[!is.na(over(poi.grocery, geometry(simd.bottom5r.wgs))),]

pubs.simd.bottom15r.wgs <- poi.pubs.wgs[!is.na(over(poi.pubs.wgs, geometry(simd.bottom15r.wgs))),]
pubs.simd.bottom10r.wgs <- poi.pubs.wgs[!is.na(over(poi.pubs.wgs, geometry(simd.bottom10r.wgs))),]
pubs.simd.bottom5r.wgs <- poi.pubs.wgs[!is.na(over(poi.pubs.wgs, geometry(simd.bottom5r.wgs))),]

pawnbrokers.simd.bottom15r.wgs <- poi.pawnbrokers.wgs[!is.na(over(poi.pawnbrokers.wgs, geometry(simd.bottom15r.wgs))),]
checkcashing.simd.bottom15r.wgs <- poi.checkcashing.wgs[!is.na(over(poi.checkcashing.wgs, geometry(simd.bottom15r.wgs))),]
pow.simd.bottom15r.wgs <- poi.pow.wgs[!is.na(over(poi.pow.wgs, geometry(simd.bottom15r.wgs))),]


# Part II.2 Run some calculations on presence for various types of entities

# Calculate total number of DTAS "community anchors" within bottom 15%
length(dtas.simd.bottom15r$rank) /  length(dtas$name)

# Calculate percentage of total pubs in Scotland located within bottom 15% of IMD by rank
length(pubs.simd.bottom15r.wgs$name) / length(poi.pubs.wgs$name)

# places of worship
length(pow.simd.bottom15r.wgs$name) / length(poi.pow.wgs$name)

# Calculate percentage of total retail grocers in Scotland
# located within bottom 15% of IMD by rank
length(grocery.simd.bottom15r.wgs$store_name) / length(poi.grocery$store_name)

# drill a bit deeper - morrisons stores
morrisons.scotland <- poi.grocery[poi.grocery$retailer == "Morrisons", ]
morrisons.bottom15r.scotland <- grocery.simd.bottom15r.wgs[grocery.simd.bottom15r.wgs$retailer == "Morrisons", ]
length(morrisons.bottom15r.scotland$store_name) / length(morrisons.scotland)

# sainsburys
sainsburys.scotland <- poi.grocery[poi.grocery$retailer == "Sainsburys", ]
sainsburys.bottom15r.scotland <- grocery.simd.bottom15r.wgs[grocery.simd.bottom15r.wgs$retailer == "Sainsburys", ]
length(sainsburys.bottom15r.scotland$store_name) / length(sainsburys.scotland)

# check cashing
plot(poi.checkcashing.wgs)
par(new=FALSE)
plot(checkcashing.simd.bottom15r.wgs, col="blue")
length(checkcashing.simd.bottom15r.wgs$name) / length(poi.checkcashing.wgs$name)

# pawnbrokers
length(pawnbrokers.simd.bottom15r.wgs$name) / length(poi.pawnbrokers.wgs$name)