Getting PTV GTFS Into Your MySQL Database

PTV has finally released their trip information in GTFS format, but that's only half of the battle; to make use of this data, you'll need it in a format other than 'zip file of text files'. Because I (decided to, this week) love Bash, I though I'd use everyone's favourite shell to get up close with GTFS to MySQL translations.

You'll need a schema:

CREATE DATABASE IF NOT EXISTS `gtfs` DEFAULT CHARACTER SET latin1;
USE `gtfs`;

-- --------------------------------------------------------

--
-- Table structure for table `agency`
--

CREATE TABLE IF NOT EXISTS `agency` (
  `agency_id` varchar(11) NOT NULL,
  `agency_name` varchar(255) DEFAULT NULL,
  `agency_url` varchar(255) DEFAULT NULL,
  `agency_timezone` varchar(50) DEFAULT NULL,
  `agency_lang` varchar(2) NOT NULL,
  `mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

-- --------------------------------------------------------

--
-- Table structure for table `calendar`
--

CREATE TABLE IF NOT EXISTS `calendar` (
  `service_id` varchar(15) NOT NULL DEFAULT '',
  `monday` tinyint(1) DEFAULT NULL,
  `tuesday` tinyint(1) DEFAULT NULL,
  `wednesday` tinyint(1) DEFAULT NULL,
  `thursday` tinyint(1) DEFAULT NULL,
  `friday` tinyint(1) DEFAULT NULL,
  `saturday` tinyint(1) DEFAULT NULL,
  `sunday` tinyint(1) DEFAULT NULL,
  `start_date` varchar(8) DEFAULT NULL,
  `end_date` varchar(8) DEFAULT NULL,
  `mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

-- --------------------------------------------------------

--
-- Table structure for table `calendar_dates`
--

CREATE TABLE IF NOT EXISTS `calendar_dates` (
  `service_id` varchar(15) NOT NULL DEFAULT '',
  `date` varchar(8) NOT NULL DEFAULT '',
  `date_timestamp` int(11) DEFAULT NULL,
  `exception_type` int(2) DEFAULT NULL,
  `mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

-- --------------------------------------------------------

--
-- Table structure for table `routes`
--

CREATE TABLE IF NOT EXISTS `routes` (
  `route_id` varchar(15) NOT NULL,
  `agency_id` varchar(11) DEFAULT NULL,
  `route_short_name` varchar(50) DEFAULT NULL,
  `route_long_name` varchar(255) DEFAULT NULL,
  `route_type` int(2) DEFAULT NULL,
  `route_text_color` varchar(255) DEFAULT NULL,
  `route_color` varchar(255) DEFAULT NULL,
  `route_url` varchar(255) DEFAULT NULL,
  `route_desc` varchar(255) DEFAULT NULL,
  `mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

-- --------------------------------------------------------

--
-- Table structure for table `shapes`
--

CREATE TABLE IF NOT EXISTS `shapes` (
  `shape_id` varchar(30) NOT NULL,
  `shape_pt_lat` decimal(18,15) NOT NULL,
  `shape_pt_lon` decimal(18,15) NOT NULL,
  `shape_pt_sequence` int(11) NOT NULL,
  `shape_dist_traveled` double NOT NULL,
  `mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

-- --------------------------------------------------------

--
-- Table structure for table `stops`
--

CREATE TABLE IF NOT EXISTS `stops` (
  `stop_id` varchar(20) NOT NULL,
  `stop_code` varchar(50) DEFAULT NULL,
  `stop_name` varchar(255) DEFAULT NULL,
  `stop_desc` varchar(255) DEFAULT NULL,
  `stop_lat` decimal(18,15) DEFAULT NULL,
  `stop_lon` decimal(18,15) DEFAULT NULL,
  `zone_id` int(11) DEFAULT NULL,
  `stop_url` varchar(255) DEFAULT NULL,
  `location_type` int(2) DEFAULT NULL,
  `parent_station` int(11) DEFAULT NULL,
  `mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

-- --------------------------------------------------------

--
-- Table structure for table `stop_times`
--

CREATE TABLE IF NOT EXISTS `stop_times` (
  `trip_id` varchar(35) NOT NULL,
  `arrival_time` time DEFAULT NULL,
  `arrival_time_seconds` int(11) DEFAULT NULL,
  `departure_time` time DEFAULT NULL,
  `departure_time_seconds` int(11) DEFAULT NULL,
  `stop_id` int(11) DEFAULT NULL,
  `stop_sequence` int(11) DEFAULT NULL,
  `stop_headsign` varchar(50) DEFAULT NULL,
  `pickup_type` int(2) DEFAULT NULL,
  `drop_off_type` int(2) DEFAULT NULL,
  `shape_dist_traveled` varchar(50) DEFAULT NULL,
  `mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

-- --------------------------------------------------------

--
-- Table structure for table `trips`
--

CREATE TABLE IF NOT EXISTS `trips` (
  `route_id` varchar(15) DEFAULT NULL,
  `service_id` varchar(15) DEFAULT NULL,
  `trip_id` varchar(30) NOT NULL,
  `trip_headsign` varchar(255) DEFAULT NULL,
  `trip_short_name` varchar(255) DEFAULT NULL,
  `direction_id` tinyint(1) DEFAULT NULL,
  `block_id` int(11) DEFAULT NULL,
  `shape_id` varchar(20) DEFAULT NULL,
  `mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

--
-- Indexes for dumped tables
--

--
-- Indexes for table `agency`
--
ALTER TABLE `agency`
 ADD PRIMARY KEY (`agency_id`);

--
-- Indexes for table `calendar`
--
ALTER TABLE `calendar`
 ADD PRIMARY KEY (`service_id`);

--
-- Indexes for table `calendar_dates`
--
ALTER TABLE `calendar_dates`
 ADD PRIMARY KEY (`service_id`,`date`), ADD KEY `service_id` (`service_id`), ADD KEY `date_timestamp` (`date_timestamp`), ADD KEY `exception_type` (`exception_type`);

--
-- Indexes for table `routes`
--
ALTER TABLE `routes`
 ADD PRIMARY KEY (`route_id`), ADD KEY `agency_id` (`agency_id`), ADD KEY `route_type` (`route_type`);

--
-- Indexes for table `stops`
--
ALTER TABLE `stops`
 ADD PRIMARY KEY (`stop_id`), ADD KEY `zone_id` (`zone_id`), ADD KEY `stop_lat` (`stop_lat`), ADD KEY `stop_lon` (`stop_lon`), ADD KEY `location_type` (`location_type`), ADD KEY `parent_station` (`parent_station`);

--
-- Indexes for table `stop_times`
--
ALTER TABLE `stop_times`
 ADD KEY `trip_id` (`trip_id`), ADD KEY `arrival_time_seconds` (`arrival_time_seconds`), ADD KEY `departure_time_seconds` (`departure_time_seconds`), ADD KEY `stop_id` (`stop_id`), ADD KEY `stop_sequence` (`stop_sequence`), ADD KEY `pickup_type` (`pickup_type`), ADD KEY `drop_off_type` (`drop_off_type`);

--
-- Indexes for table `trips`
--
ALTER TABLE `trips`
 ADD PRIMARY KEY (`trip_id`), ADD KEY `route_id` (`route_id`), ADD KEY `service_id` (`service_id`), ADD KEY `direction_id` (`direction_id`), ADD KEY `block_id` (`block_id`), ADD KEY `shape_id` (`shape_id`);

I went a little bit key-happy; feel free to omit these in the name of space, albeit at the cost of query speed.

Code (which I assume you'll install in /home/joel/gtfs/ – adjust accordingly if not)

#!/bin/bash

FOLDER=/tmp/gtfs

if [ ! -d "$FOLDER" ]; then
        mkdir /tmp/gtfs
fi

rm -rf "$FOLDER"/*
curl http://data.ptv.vic.gov.au/downloads/gtfs.zip > "$FOLDER"/gtfs.zip
unzip -d "$FOLDER"/ "$FOLDER"/gtfs.zip
rm "$FOLDER"/gtfs.zip

python /home/joel/gtfs/gtfs-sql/truncate.py

for mode in "$FOLDER"/2 "$FOLDER"/3 "$FOLDER"/4
do
        echo "Processing $file"
        unzip -d "$mode"/ "$mode"/google_transit.zip

        for file2 in $mode/*.txt
        do
                python /home/joel/gtfs/gtfs-sql/bom.py "$file2"
        done

        for file in "$mode"/*.txt
        do
                COLS=$(head -n 1 "$file"|tr -d '\r\n')
                mysqlimport --verbose --local --columns="$COLS" --fields-terminated-by=',' --fields-optionally-enclosed-by='\"' --lines-terminated-by='\r\n' --ignore-lines=1 --use-threads=5 gtfs "$file"
                t=${file%.txt}
                mysql gtfs --batch --silent -e "update `basename ${t}` set mode = `basename ${mode}` where mode = 0"
        done
done

echo 'Done'

You will also need /home/joel/gtfs/gtfs-sql/bom.py:

import os, sys, codecs

BUFSIZE = 4096
BOMLEN = len(codecs.BOM_UTF8)

path = sys.argv[1]
with open(path, "r+b") as fp:
    chunk = fp.read(BUFSIZE)
    if chunk.startswith(codecs.BOM_UTF8):
        i = 0
        chunk = chunk[BOMLEN:]
        while chunk:
            fp.seek(i)
            fp.write(chunk)
            i += len(chunk)
            fp.seek(BOMLEN, os.SEEK_CUR)
            chunk = fp.read(BUFSIZE)
        fp.seek(-BOMLEN, os.SEEK_CUR)
        fp.truncate()

This is required because PTV specifically breaks the recommendation of the W3C and a bunch of other organisations and puts a Byte Order Mark (BOM) right where Unix utilities will choke on it.

Anyway, this will get you the Melbourne Metro PTV data, at the cost of around 1.5GB.