{ "metadata": { "name": "readdata" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# system modules\n", "import os\n", "import sys\n", "import logging\n", "import itertools\n", "\n", "# shortcuts by convention\n", "import numpy as np\n", "import matplotlib\n", "#matplotlib.use('Agg')\n", "import matplotlib.pyplot as plt\n", "\n", "import shapely\n", "import shapely.geometry\n", "# for tables\n", "import pandas\n", "import pandas.io\n", "\n", "# Use mx.DateTime objects because it can handle years 0<->200000, we can't use datetime, datetime64, time_struct's\n", "import mx.DateTime \n", "\n", "# There's a few issues with the other date modules\n", "\n", "# years > 9999, don't fit in the posix spec, thus don't work with python time_struct and datetime, which are built on posix\n", "\n", "# Another issue is with the calendar, the Julian -> Gregorian switch.\n", "# years where year % 100 == 0 and year <= 1500 did have a leap year (assuming you switch to julian calendar)\n", "# The Gregorian calendar began by skipping 10 calendar days, to restore March 21 as the date of the vernal equinox.\n", "# The Dutch provinces of Brabant, Zeeland and the Staten-Generaal also adopted it on 25 December 1582\n", "# Southern Netherlands (Limburg?) switched on 1 January 1583, the province of Holland followed switched on 12 January 1583\n", "\n", "# the only date type that handles this is mx.DateTime.JulianDate\n", "# If you use the gregorian calendar you'd have to go use non-leap year for 1500\n", "# So, because the input has dates like 1500-2-29, we have to assume that we're still using the Julian calendar.\n", "# Let's reset our clocks\n", "print 'Today is', mx.DateTime.now().Julian(), 'in the Julian calendar, which we are using here.'" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Today is 2012-10-23 11:36:15.98 in the Julian calendar, which we are using here.\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "# the time horizon\n", "horizon = 15\n", "\n", "# Dataset for caching\n", "# use hdf5 here (faster than csv, no metadata)\n", "store = pandas.io.pytables.HDFStore('store.h5')\n", "print store.keys()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[]\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "# The input file\n", "filename = 'invoer/2000012310000_BorgharenHBV_Glue50.XML'\n", "#filename = 'sample.txt'\n", "# show the header\n", "with open(filename) as f:\n", " for i, line in enumerate(f):\n", " print line,\n", " if i>2:\n", " break" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Q.uh\n", "0001 01 02 00 00 H-MS-BORD\n", "0001 01 02 00 00 143.93\n", "0001 01 03 00 00 170.08\n" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "\n", "def read_file(filename):\n", " \"\"\"Read the file, assuming the format YYYY[Y]? MM DD HH MM VALUE, yield years\"\"\"\n", " # Creating an array of 2 ints and 1 double for 20k year is about 500MB of memory.\n", " # Normaly you would just read everything in memory. But with an estimate of 50k year memory would be full. \n", " # So we do a running mean. For convenience we generate data grouped by years.\n", " with open(filename) as f:\n", " f.next()\n", " f.next()\n", " # Keep a moving year in memory, yield 3 years (old, current, next)\n", " currentyear = 0\n", " for line in f:\n", " # This is the most cpu expensive part. \n", " # Data is in a mixed format so there is no easy way to do this....\n", " row = line.split()\n", " # No time, just dates, should be enough\n", " date = mx.DateTime.JulianDate(int(row[0]), int(row[1]), int(row[2]))\n", " \n", " value = float(row[-1])\n", " yield (date.year, date.day_of_year, value)\n", "\n", "def groupyears(data):\n", " \"\"\"goup years in data\"\"\"\n", " keyfunc = lambda x:x[0]\n", " for year, daysvalues in itertools.groupby(data, keyfunc):\n", " # return a tuple of all year data...\n", " yield list(daysvalues)\n", "\n", "def two(iterator):\n", " \"\"\"yield a running window of three elements\"\"\"\n", " # generalize to n using stack pop, append\n", " \n", " # default to returning an empty list, assuming that elements are iterable\n", " old = []\n", " current = []\n", " for item in iterator:\n", " old = current\n", " current = item\n", " yield (old,current)\n", "\n", "# more generic.... but missing the first and last element\n", "# this would start at year 2 and end at year 19999\n", "def window(seq, n=3):\n", " \"Returns a sliding window (of width n) over data from the iterable\"\n", " \" s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...\"\n", " # we're using iterators\n", " it = iter(seq)\n", " # take the first n elements\n", " result = tuple(itertools.islice(it, n))\n", " # and return it, except if len(result) < n\n", " if len(result) == n:\n", " yield result\n", " # now loop over all the elements, rolling th window\n", " for elem in it:\n", " result = result[1:] + (elem,)\n", " yield result" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "def annualmaximum(annual, currentyear, horizon=15):\n", " \"\"\"look up the annual maximum\"\"\"\n", " # find the maximum discharge\n", " idxmax = annual['discharge'].idxmax()\n", " # make a window (not bigger than what we have...)\n", " # some implicit assumptions, better check...\n", " window = slice(max(idxmax - horizon,annual.index.min().item()), min(idxmax + horizon,annual.index.max().item()))\n", " records = annual.ix[window]\n", " records['eventyear'] = currentyear\n", " # Create relative dates\n", " records['day_from_event'] = np.asarray(records.index) - idxmax\n", "\n", " # returns list of tuple instead of dataframe to reduce memory usage\n", " # drop the index (x[0])\n", " data = list(x[1:] for x in records[['eventyear','day_from_event','discharge']].itertuples())\n", " return data\n", "\n", "# main loop to compute all the annual maxima\n", "def compute_annualmaxima(data):\n", " allrecords = []\n", " yearsiter = two(groupyears(data))\n", " for i, (oldyear, currentyear) in enumerate(yearsiter):\n", " # We do need a currentyear...\n", " if not currentyear:\n", " continue\n", " # which year are we taking the max of\n", " df = pandas.DataFrame(oldyear + currentyear, columns=('year', 'day_of_year', 'discharge'))\n", " hydrologicyear = currentyear[0][0]-1\n", " # We use days of year here, 274 == October 1st in a leap year\n", " hydrologicyearfilter = np.logical_or(\n", " np.logical_and(df['year'] == hydrologicyear, df['day_of_year'] >= 274),\n", " np.logical_and(df['year'] == hydrologicyear+1, df['day_of_year'] < 274)\n", " )\n", " # Lookup the records surrounding the annual maximum\n", " records = annualmaximum(df[hydrologicyearfilter], currentyear=hydrologicyear)\n", " allrecords.extend(records)\n", " df = pandas.DataFrame.from_records(allrecords, columns=['eventyear', 'day_from_event', 'discharge'])\n", " return df" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "def compute_daysover(df):\n", " grouped = df.groupby(['eventyear'])\n", " dayscum = []\n", " def count(x):\n", " return len(x)\n", " \n", " for year, event in iter(grouped):\n", " # let's do some geometry\n", " x = [-15] + list(event['day_from_event']) + [15, -15]\n", " y = [0] + list(event['discharge']) + [0, 0]\n", " poly = shapely.geometry.Polygon(np.c_[x,y].tolist())\n", " # let's start drawing\n", " # some arbitrary top, so we can do an intersection\n", " for low in range(0,5000,100):\n", " high = low+100\n", " # Create the left and right box\n", " lbox = shapely.geometry.Polygon([[-15, high], [0, high], [0,low], [-15,low],[-15, high]])\n", " rbox = shapely.geometry.Polygon([[0, high], [15, high], [15,low], [0,low],[0, high]])\n", " # store the intersection areas\n", " dayscum.append((low, high, lbox.intersection(poly).area, rbox.intersection(poly).area))\n", " dayscum = np.array(dayscum)\n", " daysoverdf = pandas.DataFrame(dict(start=dayscum[:,0], end=dayscum[:,1], left=dayscum[:,2], right=dayscum[:,3]))\n", " return daysoverdf" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "# cache\n", "# do we have data?\n", "if 'data' in store.keys():\n", " data = store['data']\n", "else:\n", " # store as a dataframe \n", " # memory is about 250MB for 20k years\n", " # to save memory, pass read_file iterator to compute_annualmaximum\n", " data = pandas.DataFrame.from_records(list(read_file(filename)), columns=['year', 'day_of_year', 'discharge'])\n", " store['data'] = data\n", " store.flush()\n", " \n", "# do we have the maxima?\n", "# compute_annualmaxima is O(n) in CPU and memory.\n", "# runtime +- 60ms per year (about 2m30 for 20k years)\n", "# memory +- 100MB for 20k years, 20MB on disk\n", "if 'annual_maxima' in store.keys():\n", " df = store['annual_maxima']\n", "else:\n", " # pass the data as an array, looping over a dataframe is a bit annoying\n", " df = compute_annualmaxima(np.array(data))\n", " store['annual_maxima'] = df\n", " store.flush()\n", "\n", "# do we have the daysover?\n", "# Number of days over a certain discharge\n", "if 'daysover' in store.keys():\n", " daysoverdf = store['daysover']\n", "else:\n", " # pass the data as an array, looping over a dataframe is a bit annoying\n", " daysoverdf = compute_daysover(df)\n", " store['daysover'] = daysoverdf\n", " store.flush()" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "MemoryError", "evalue": "", "output_type": "pyerr", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;31m# pass the data as an array, looping over a dataframe is a bit annoying\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 21\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcompute_annualmaxima\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 22\u001b[0m \u001b[0mstore\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'annual_maxima'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[0mstore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mMemoryError\u001b[0m: " ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }