Automatically create geographical itemset.csv for external cascading select

chrissyhroberts · October 2, 2018, 9:38pm

Cross-posting this from the tech support forum in case useful.

This R script will download administrative data for a list of countries specified by the user, then create an itemset.csv file that can be used in cascading selects such as Country > Region > District > Sub-district.

Data downloads from diva.gis and so license inherited from their data sets.
Script is MIT license.
Change this line to set the countries.

countries<-c("UGA","COD")

Use ISO-3166 Alpha 3 codes [https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3].

###############################################
# Using Administrative data from DIVA-GIS,
# this script will convert the geographical
# administrative data (levels 0-3)
# for one or more country
# and output an itemsets.csv file
# that works with ODK and preserves accented
# characters (tested only on roman characters)
# Also removes accents from internal variables
# like 'name'. Accents will appear on screen but
# won't be preserved in data frames.
################################################
# MIT LICENCE Copyright © 2018 <chrissy.roberts@lshtm.ac.uk>
################################################
# INSTRUCTIONS
# First get administrative data from http://www.diva-gis.org/datadown
# for each of the countries you want to include
# unzip to a folder in the same directory as this script
# this example uses Uganda and Democratic Republic of Congo.
if(!dir.exists("download"))(dir.create("download"))
system("rm -rf download/*")
# Specify ISO codes (3 digit) to tell R which data sets to include

#https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3

countries<-c("UGA","COD")


#for each country, find the data set in the folder structure and load up then bind together
for(i in 1:length(countries))
{
  download.file(url = paste("http://biogeo.ucdavis.edu/data/diva/adm/",countries[i],"_adm.zip",sep=""),destfile =paste("download/",countries[i],"_adm.zip",sep=""))
  system (paste("unzip download/",countries[i],"_adm.zip -d download/",countries[i],sep=""))
  level3data<-read.csv(list.files(pattern = paste(countries[i],"_adm3.csv",sep=""),full.names = T,recursive = T))
  level3data<-level3data[,c("NAME_0","NAME_1","NAME_2","NAME_3","ISO")]
  if(i==1){full.data.output<-level3data}
  if(i!=1){full.data.output<-rbind(full.data.output,level3data)}
  system ("rm download/license.txt")
}

#set correct names so that label is descriptive country name
names(full.data.output)[which(names(full.data.output)=="NAME_0")]<-"label"
#set correct names so that ISO code is NAME_0 (level 0 country data)
names(full.data.output)[which(names(full.data.output)=="ISO")]<-"NAME_0"

#get rid of whitespace and dots and so on [might need to add more]
full.data.output<-full.data.output[,c("NAME_0","NAME_1","NAME_2","NAME_3","label")]
full.data.output$NAME_1<-gsub(full.data.output$NAME_1,pattern = "/| |'|//.",replacement = "_")
full.data.output$NAME_2<-gsub(full.data.output$NAME_2,pattern = "/| |'|//.",replacement = "_")
full.data.output$NAME_3<-gsub(full.data.output$NAME_3,pattern = "/| |'|//.",replacement = "_")

#find level zero and blank out levels 1,2,3
level0<-full.data.output
level0[,2:4]<-""
level0<-unique(level0)
level0$list_name<-"NAME_0"


#find level one and blank out levels 2,3
level1<-full.data.output[,c(2,1,3,4,5)]
level1[,3:4]<-""
level1<-unique(level1)
level1$list_name<-"NAME_1"
level1$label<-level1$NAME_1
x<-level1
x$NAME_1<-""
x$label<-""
x<-unique(x)
x$NAME_1<-"Other"
x$label<-"Other"
level1<-rbind(level1,x)
rm(x)


#find level two and blank out level 3
level2<-full.data.output[,c(3,1,2,4,5)]
level2[,4]<-""
level2<-unique(level2)
level2$list_name<-"NAME_2"
level2$label<-level2$NAME_2
x<-level2
x$NAME_2<-""
x$label<-""
x<-unique(x)
x$NAME_2<-"Other"
x$label<-"Other"
level2<-rbind(level2,x)
rm(x)

#find level three
level3<-full.data.output[,c(4,1,2,3,5)]
level3<-unique(level3)
level3$list_name<-"NAME_3"
level3$label<-level3$NAME_3
x<-level3
x$NAME_3<-""
x$label<-""
x<-unique(x)
x$NAME_3<-"Other"
x$label<-"Other"
level3<-rbind(level3,x)
rm(x)


#put it all together
output<-level0
output<-rbind(output,level1)
output<-rbind(output,level2)
output<-rbind(output,level3)

#define function to remove accents from text
removeAccents<-function(x)
{
  a <- c('À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ÿ', 'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č', 'Ď', 'ď', 'Đ', 'đ', 'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'Ĳ', 'ĳ', 'Ĵ', 'ĵ', 'Ķ', 'ķ', 'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', 'Ŀ', 'ŀ', 'Ł', 'ł', 'Ń', 'ń', 'Ņ', 'ņ', 'Ň', 'ň', 'ŉ', 'Ō', 'ō', 'Ŏ', 'ŏ', 'Ő', 'ő', 'Œ', 'œ', 'Ŕ', 'ŕ', 'Ŗ', 'ŗ', 'Ř', 'ř', 'Ś', 'ś', 'Ŝ', 'ŝ', 'Ş', 'ş', 'Š', 'š', 'Ţ', 'ţ', 'Ť', 'ť', 'Ŧ', 'ŧ', 'Ũ', 'ũ', 'Ū', 'ū', 'Ŭ', 'ŭ', 'Ů', 'ů', 'Ű', 'ű', 'Ų', 'ų', 'Ŵ', 'ŵ', 'Ŷ', 'ŷ', 'Ÿ', 'Ź', 'ź', 'Ż', 'ż', 'Ž', 'ž', 'ſ', 'ƒ', 'Ơ', 'ơ', 'Ư', 'ư', 'Ǎ', 'ǎ', 'Ǐ', 'ǐ', 'Ǒ', 'ǒ', 'Ǔ', 'ǔ', 'Ǖ', 'ǖ', 'Ǘ', 'ǘ', 'Ǚ', 'ǚ', 'Ǜ', 'ǜ', 'Ǻ', 'ǻ', 'Ǽ', 'ǽ', 'Ǿ', 'ǿ');
  b <- c('A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'O', 'U', 'U', 'U', 'U', 'Y', 's', 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 'n', 'o', 'o', 'o', 'o', 'o', 'o', 'u', 'u', 'u', 'u', 'y', 'y', 'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'IJ', 'ij', 'J', 'j', 'K', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'l', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', 'O', 'o', 'O', 'o', 'O', 'o', 'OE', 'oe', 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's', 'f', 'O', 'o', 'U', 'u', 'A', 'a', 'I', 'i', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'A', 'a', 'AE', 'ae', 'O', 'o');
  for(i in 1:length(a))
  {
    x<-gsub(x = x,pattern = a[i],replacement = b[i])
  }
  return(x)
}

# remove accents from name, levels 0-3, leaving them only in label.
output<-output[,c("list_name","NAME_0","NAME_1","NAME_2","NAME_3","label")]
output$name<-output$label
output$name<-removeAccents(output$name)
output$NAME_0<-removeAccents(output$NAME_0)
output$NAME_1<-removeAccents(output$NAME_1)
output$NAME_2<-removeAccents(output$NAME_2)
output$NAME_3<-removeAccents(output$NAME_3)

head(output)

#write output to itemsets.csv
write.csv(x = output,file = "itemsets.csv",quote = F,row.names = F)