From Tango comes news of something amazing: a regularly updated cross-reference table for all the different player ID numbers out there. I immediately added this to my baseball database (which already includes Baseball Databank, Retrosheet, and Pitch f/x data). Since I already have a script that updates the pitch and player-level data every night, I figured I might as well add the ID data as well. I use R to do it, even though something like Python would probably be more appropriate, simply because I know R better so it was quicker to do it this way. The way the script is written assumes a Linux system and a MySQL database backend, but it could be adapted for other scenarios.
This script will download the latest ID database, unzip it, set up a MySQL database based on the fields that are present in the ID files, save the ID data into new files that are formatted for MySQL import, and then load the data files into the newly-created database. Note that the database “baseballid” must already exist, but it doesn’t need to have any tables in it.
Now I can just run R CMD BATCH update_baseballid.R every night, and I get a freshly updated database in the morning.
tmp <- paste(tempfile(),'zip',sep='.')
download.file("http://balco.sabr.org/data/baseballid/baseballid-latest.zip",tmp)
register <- read.csv(pipe(paste("unzip -p",tmp,"baseballid*/register.csv")), as.is=TRUE)
rosters <- read.csv(pipe(paste("unzip -p",tmp,"baseballid*/rosters.csv")), as.is=TRUE)
sqline <- function(var,varname) {
if(class(var)=="character") {
line <- paste("`",varname,"`"," varchar(",max(nchar(var)),") NOT NULL default ''",sep="")
} else {
line <- paste("`",varname,"`"," int(",floor(log10(max(var,na.rm=TRUE)))+1,") default NULL",sep="")
}
return(line)
}
register.query <- paste("CREATE TABLE registerĀ (",
paste(lapply(names(register), function(x) sqline(register[[x]], x)), collapse=',\n'),
", PRIMARY KEY (`key_uuid`)",
") ENGINE=MyISAM DEFAULT CHARSET=latin1")
rosters.query <- paste("CREATE TABLE rostersĀ (",
paste(lapply(names(rosters), function(x) sqline(rosters[[x]], x)), collapse=',\n'),
", PRIMARY KEY (`key_uuid`)",
") ENGINE=MyISAM DEFAULT CHARSET=latin1")
library(RMySQL)
drv <- dbDriver("MySQL")
con <- dbConnect(drv, username="<your username>", password="<your password>", dbname="baseballid", host="localhost")
dbGetQuery(con,"DROP TABLE IF EXISTS register")
dbGetQuery(con,"DROP TABLE IF EXISTS rosters")
dbGetQuery(con,register.query)
dbGetQuery(con,rosters.query)
for(v in names(register)) if(class(register[[v]])=="character") register[[v]][which(register[[v]]=='')] <- NA
write.csv(register, '/home/pefrase/Misc/pitchdb/tmp/register.csv', row.names=FALSE, na='NULL')
for(v in names(rosters)) if(class(rosters[[v]])=="character") rosters[[v]][which(rosters[[v]]=='')] <- NA
write.csv(rosters, '/home/pefrase/Misc/pitchdb/tmp/rosters.csv', row.names=FALSE, na='NULL')
dbGetQuery(con,"LOAD DATA LOCAL INFILE '/home/pefrase/Misc/pitchdb/tmp/register.csv'
INTO TABLE register
FIELDS
OPTIONALLY ENCLOSED BY '\"'
TERMINATED BY ',' LINES TERMINATED BY '\n'
IGNORE 1 LINES")
dbGetQuery(con,"LOAD DATA LOCAL INFILE '/home/pefrase/Misc/pitchdb/tmp/rosters.csv'
INTO TABLE rosters
FIELDS
OPTIONALLY ENCLOSED BY '\"'
TERMINATED BY ',' LINES TERMINATED BY '\n'
IGNORE 1 LINES")
system("rm /home/pefrase/Misc/pitchdb/tmp/*")