{
 "metadata": {
  "name": "readdata"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# system modules\n",
      "import os\n",
      "import sys\n",
      "import logging\n",
      "import itertools\n",
      "\n",
      "# shortcuts by convention\n",
      "import numpy as np\n",
      "import matplotlib\n",
      "#matplotlib.use('Agg')\n",
      "import matplotlib.pyplot as plt\n",
      "\n",
      "import shapely\n",
      "import shapely.geometry\n",
      "# for tables\n",
      "import pandas\n",
      "import pandas.io\n",
      "\n",
      "# Use mx.DateTime objects because it can handle years 0<->200000, we can't use datetime, datetime64, time_struct's\n",
      "import mx.DateTime \n",
      "\n",
      "# There's a few issues with the other date modules\n",
      "\n",
      "# years > 9999, don't fit in the posix spec, thus don't work with python time_struct and datetime, which are built on posix\n",
      "\n",
      "# Another issue is with the calendar, the Julian -> Gregorian switch.\n",
      "# years where year % 100 == 0 and year <= 1500 did have a leap year (assuming you switch to julian calendar)\n",
      "# The Gregorian calendar began by skipping 10 calendar days, to restore March 21 as the date of the vernal equinox.\n",
      "# The Dutch provinces of Brabant, Zeeland and the Staten-Generaal also adopted it on 25 December 1582\n",
      "# Southern Netherlands (Limburg?) switched on 1 January 1583, the province of Holland followed switched on 12 January 1583\n",
      "\n",
      "# the only date type that handles this is mx.DateTime.JulianDate\n",
      "# If you use the gregorian calendar you'd have to go use non-leap year for 1500\n",
      "# So, because the input has dates like 1500-2-29, we have to assume that we're still using the Julian calendar.\n",
      "# Let's reset our clocks\n",
      "print 'Today is', mx.DateTime.now().Julian(), 'in the Julian calendar, which we are using here.'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Today is 2012-10-23 11:36:15.98 in the Julian calendar, which we are using here.\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# the time horizon\n",
      "horizon = 15\n",
      "\n",
      "# Dataset for caching\n",
      "# use hdf5 here (faster than csv, no metadata)\n",
      "store = pandas.io.pytables.HDFStore('store.h5')\n",
      "print store.keys()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[]\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# The input file\n",
      "filename = 'invoer/2000012310000_BorgharenHBV_Glue50.XML'\n",
      "#filename = 'sample.txt'\n",
      "# show the header\n",
      "with open(filename) as f:\n",
      "    for i, line in enumerate(f):\n",
      "        print line,\n",
      "        if i>2:\n",
      "            break"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Q.uh\n",
        "0001 01 02 00 00 H-MS-BORD\n",
        "0001 01 02 00 00    143.93\n",
        "0001 01 03 00 00    170.08\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "\n",
      "def read_file(filename):\n",
      "    \"\"\"Read the file, assuming the format YYYY[Y]? MM DD HH MM VALUE, yield years\"\"\"\n",
      "    # Creating an array of 2 ints and 1 double for 20k year is about 500MB of memory.\n",
      "    # Normaly you would just read everything in memory. But with an estimate of 50k year memory would be full. \n",
      "    # So we do a running mean. For convenience we generate data grouped by years.\n",
      "    with open(filename) as f:\n",
      "        f.next()\n",
      "        f.next()\n",
      "        # Keep a moving year in memory, yield 3 years (old, current, next)\n",
      "        currentyear = 0\n",
      "        for line in f:\n",
      "            # This is the most cpu expensive part. \n",
      "            # Data is in a mixed format so there is no easy way to do this....\n",
      "            row = line.split()\n",
      "            # No time, just dates, should be enough\n",
      "            date = mx.DateTime.JulianDate(int(row[0]), int(row[1]), int(row[2]))\n",
      "            \n",
      "            value = float(row[-1])\n",
      "            yield (date.year, date.day_of_year, value)\n",
      "\n",
      "def groupyears(data):\n",
      "    \"\"\"goup years in data\"\"\"\n",
      "    keyfunc = lambda x:x[0]\n",
      "    for year, daysvalues in itertools.groupby(data, keyfunc):\n",
      "        # return a tuple of all year data...\n",
      "        yield list(daysvalues)\n",
      "\n",
      "def two(iterator):\n",
      "    \"\"\"yield a running window of three elements\"\"\"\n",
      "    # generalize to n using stack pop, append\n",
      "    \n",
      "    # default to returning an empty list, assuming that elements are iterable\n",
      "    old = []\n",
      "    current = []\n",
      "    for item in iterator:\n",
      "        old = current\n",
      "        current = item\n",
      "        yield (old,current)\n",
      "\n",
      "# more generic.... but missing the first and last element\n",
      "# this would start at year 2 and end at year 19999\n",
      "def window(seq, n=3):\n",
      "    \"Returns a sliding window (of width n) over data from the iterable\"\n",
      "    \"   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...\"\n",
      "    # we're using iterators\n",
      "    it = iter(seq)\n",
      "    # take the first n elements\n",
      "    result = tuple(itertools.islice(it, n))\n",
      "    # and return it, except if len(result) < n\n",
      "    if len(result) == n:\n",
      "        yield result\n",
      "    # now loop over all the elements, rolling th window\n",
      "    for elem in it:\n",
      "        result = result[1:] + (elem,)\n",
      "        yield result"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def annualmaximum(annual, currentyear, horizon=15):\n",
      "    \"\"\"look up the annual maximum\"\"\"\n",
      "    # find the maximum discharge\n",
      "    idxmax = annual['discharge'].idxmax()\n",
      "    # make a window (not bigger than what we have...)\n",
      "    # some implicit assumptions, better check...\n",
      "    window = slice(max(idxmax - horizon,annual.index.min().item()), min(idxmax + horizon,annual.index.max().item()))\n",
      "    records = annual.ix[window]\n",
      "    records['eventyear'] = currentyear\n",
      "    # Create relative dates\n",
      "    records['day_from_event'] = np.asarray(records.index) - idxmax\n",
      "\n",
      "    # returns list of tuple instead of dataframe to reduce memory usage\n",
      "    # drop the index (x[0])\n",
      "    data = list(x[1:] for x in records[['eventyear','day_from_event','discharge']].itertuples())\n",
      "    return data\n",
      "\n",
      "# main loop to compute all the annual maxima\n",
      "def compute_annualmaxima(data):\n",
      "    allrecords = []\n",
      "    yearsiter = two(groupyears(data))\n",
      "    for i, (oldyear, currentyear) in enumerate(yearsiter):\n",
      "        # We do need a currentyear...\n",
      "        if not currentyear:\n",
      "            continue\n",
      "        # which year are we taking the max of\n",
      "        df = pandas.DataFrame(oldyear + currentyear, columns=('year', 'day_of_year', 'discharge'))\n",
      "        hydrologicyear = currentyear[0][0]-1\n",
      "        # We use days of year here, 274 == October 1st in a leap year\n",
      "        hydrologicyearfilter = np.logical_or(\n",
      "            np.logical_and(df['year'] == hydrologicyear, df['day_of_year'] >= 274),\n",
      "            np.logical_and(df['year'] == hydrologicyear+1, df['day_of_year'] < 274)\n",
      "            )\n",
      "        # Lookup the records surrounding the annual maximum\n",
      "        records = annualmaximum(df[hydrologicyearfilter], currentyear=hydrologicyear)\n",
      "        allrecords.extend(records)\n",
      "    df = pandas.DataFrame.from_records(allrecords, columns=['eventyear', 'day_from_event', 'discharge'])\n",
      "    return df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def compute_daysover(df):\n",
      "    grouped = df.groupby(['eventyear'])\n",
      "    dayscum = []\n",
      "    def count(x):\n",
      "        return len(x)\n",
      "    \n",
      "    for year, event in iter(grouped):\n",
      "        # let's do some geometry\n",
      "        x = [-15] + list(event['day_from_event']) + [15, -15]\n",
      "        y = [0] + list(event['discharge']) + [0, 0]\n",
      "        poly = shapely.geometry.Polygon(np.c_[x,y].tolist())\n",
      "        # let's start drawing\n",
      "        # some arbitrary top, so we can do an intersection\n",
      "        for low in range(0,5000,100):\n",
      "            high = low+100\n",
      "            # Create the left and right box\n",
      "            lbox = shapely.geometry.Polygon([[-15, high], [0, high], [0,low], [-15,low],[-15, high]])\n",
      "            rbox = shapely.geometry.Polygon([[0, high], [15, high], [15,low], [0,low],[0, high]])\n",
      "            # store the intersection areas\n",
      "            dayscum.append((low, high, lbox.intersection(poly).area, rbox.intersection(poly).area))\n",
      "    dayscum = np.array(dayscum)\n",
      "    daysoverdf = pandas.DataFrame(dict(start=dayscum[:,0], end=dayscum[:,1], left=dayscum[:,2], right=dayscum[:,3]))\n",
      "    return daysoverdf"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# cache\n",
      "# do we have data?\n",
      "if 'data' in store.keys():\n",
      "    data = store['data']\n",
      "else:\n",
      "    # store as a dataframe \n",
      "    # memory is about 250MB for 20k years\n",
      "    # to save memory, pass read_file iterator to compute_annualmaximum\n",
      "    data = pandas.DataFrame.from_records(list(read_file(filename)), columns=['year', 'day_of_year', 'discharge'])\n",
      "    store['data'] = data\n",
      "    store.flush()\n",
      "    \n",
      "# do we have the maxima?\n",
      "# compute_annualmaxima is O(n) in CPU and memory.\n",
      "# runtime +- 60ms per year (about 2m30 for 20k years)\n",
      "# memory +- 100MB for 20k years, 20MB on disk\n",
      "if 'annual_maxima' in store.keys():\n",
      "    df = store['annual_maxima']\n",
      "else:\n",
      "    # pass the data as an array, looping over a dataframe is a bit annoying\n",
      "    df = compute_annualmaxima(np.array(data))\n",
      "    store['annual_maxima'] = df\n",
      "    store.flush()\n",
      "\n",
      "# do we have the daysover?\n",
      "# Number of days over a certain discharge\n",
      "if 'daysover' in store.keys():\n",
      "    daysoverdf = store['daysover']\n",
      "else:\n",
      "    # pass the data as an array, looping over a dataframe is a bit annoying\n",
      "    daysoverdf = compute_daysover(df)\n",
      "    store['daysover'] = daysoverdf\n",
      "    store.flush()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "ename": "MemoryError",
       "evalue": "",
       "output_type": "pyerr",
       "traceback": [
        "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mMemoryError\u001b[0m                               Traceback (most recent call last)",
        "\u001b[1;32m<ipython-input-22-2f838ae250ab>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m     19\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     20\u001b[0m     \u001b[1;31m# pass the data as an array, looping over a dataframe is a bit annoying\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 21\u001b[1;33m     \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcompute_annualmaxima\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     22\u001b[0m     \u001b[0mstore\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'annual_maxima'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     23\u001b[0m     \u001b[0mstore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
        "\u001b[1;31mMemoryError\u001b[0m: "
       ]
      }
     ],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}