# Source code for tigercontrol.utils.dataset_registry

from __future__ import division
from __future__ import print_function

import os
import shutil
import xlrd
import datetime
import csv
import pandas as pd
import numpy as np
from tigercontrol.utils.download_tools import *


def to_datetime(date, time):
    """ 
    Description: Takes a date and a time and converts it to a datetime object.
    Args:
        date (string): Date in DD/MM/YYYY format
        time (string): Time in hh:mm format
    Returns:
        Datetime object containing date and time information
    """
    day_month_year = [int(x) for x in date.split('/')]
    hour_min = [int(x) for x in time.split(':')]

    return datetime.datetime(day_month_year[2], 
                             day_month_year[1], 
                             day_month_year[0], 
                             hour_min[0], 
                             hour_min[1])
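
# Example (illustrative sketch, not part of the original module): parsing a
# DD/MM/YYYY date string and an hh:mm time string into a single datetime.
#
#     dt = to_datetime('13/03/2012', '11:45')
#     # -> datetime.datetime(2012, 3, 13, 11, 45)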

def datetime_to_daysElapsed(cur_datetime, base_datetime):
    """
    Description:
        Computes the number of days elapsed since 'base' date.

    Args:
        cur_datetime (datetime): Current date and time
        base_datetime (datetime): Base date and time
    Returns:
        Float number of days elapsed (including the fractional part of a day)
    """
    time_delta = cur_datetime - base_datetime
    time_to_days = (time_delta.seconds) / (24 * 60 * 60)  # leftover seconds as a fraction of a day
    return time_delta.days + time_to_days
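
# Example (illustrative sketch, not part of the original module): one and a
# half days elapse between these two timestamps.
#
#     base = to_datetime('13/03/2012', '11:45')
#     later = to_datetime('14/03/2012', '23:45')
#     datetime_to_daysElapsed(later, base)   # -> 1.5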

# Dataset credits: https://fred.stlouisfed.org/series/UNRATE, 
# Federal Reserve Bank of St. Louis.

def unemployment(verbose=True):
    """
    Description: Checks if unemployment data exists, downloads if not.
        Dataset credits: https://fred.stlouisfed.org/series/UNRATE,
        Federal Reserve Bank of St. Louis.
    Args:
        verbose (boolean): Specifies if download progress should be printed
    Returns:
        Dataframe containing Unemployment data
    """
    import datetime
    from urllib.request import urlretrieve

    tigercontrol_dir = get_tigercontrol_dir()
    path_csv = os.path.join(tigercontrol_dir, 'data/unemployment.csv')
    if os.path.exists(path_csv):  # if file exists, return csv
        return pd.read_csv(path_csv)

    day = datetime.date.today()
    for i in range(10):  # try downloading 10 times
        day_string = day.isoformat()
        url_core = "https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=UNRATE&scale=left&cosd=1948-01-01&coed=2019-06-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date={}&revision_date={}&nd=1948-01-01"
        url_csv = url_core.format(day_string, day_string)
        try:
            urlretrieve(url_csv, path_csv)
            return pd.read_csv(path_csv)
        except Exception as e:
            if verbose:
                print("date {} download failed: {}".format(day, e))
        day = day - datetime.timedelta(days=1)  # try with previous day

    # final attempt with a fixed vintage date
    try:
        url_csv = "https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=UNRATE&scale=left&cosd=1948-01-01&coed=2019-06-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date=2019-07-19&revision_date=2019-07-19&nd=1948-01-01"
        urlretrieve(url_csv, path_csv)
        return pd.read_csv(path_csv)
    except Exception as e:
        raise RuntimeError("Error downloading URL: {}".format(e))
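
# Example usage (illustrative sketch; the exact column names in the returned
# frame come from the FRED CSV export and are not guaranteed by this module):
#
#     df = unemployment(verbose=False)
#     print(df.head())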
def uci_indoor(verbose=True):
    """
    Description: Checks if uci_indoor data exists, downloads if not.
        Dataset credits: F. Zamora-Martinez, P. Romeu, P. Botella-Rocamora, J. Pardo,
        On-line learning of indoor temperature forecasting models towards energy
        efficiency, Energy and Buildings, Volume 83, November 2014, Pages 162-172,
        ISSN 0378-7788
    Args:
        verbose (boolean): Specifies if download progress should be printed
    Returns:
        Dataframe containing uci_indoor data
    """
    tigercontrol_dir = get_tigercontrol_dir()
    url_uci_indoor = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00274/NEW-DATA.zip'
    path_uci_indoor_zip = os.path.join(tigercontrol_dir, 'data/uci_indoor.zip')
    path_uci_indoor_unzip = os.path.join(tigercontrol_dir, 'data/uci_indoor')
    path_uci_indoor_txt1 = os.path.join(tigercontrol_dir, 'data/uci_indoor/NEW-DATA-1.T15.txt')
    path_uci_indoor_csv = os.path.join(tigercontrol_dir, 'data/uci_indoor.csv')
    path_uci_indoor_cleaned = os.path.join(tigercontrol_dir, 'data/uci_indoor_cleaned.csv')

    # check if files have been downloaded before, else download
    if not os.path.exists(path_uci_indoor_cleaned):
        download(path_uci_indoor_zip, url_uci_indoor, verbose)  # get files from online URL
        unzip(path_uci_indoor_zip)
        os.remove(path_uci_indoor_zip)  # clean up - remove unnecessary files

        with open(path_uci_indoor_txt1, 'r') as txt_file:
            list_of_vecs = [line.split() for line in txt_file]  # clean downloaded data
        list_of_vecs[0] = list_of_vecs[0][1:]
        with open(path_uci_indoor_csv, "w") as csv_file:
            writer = csv.writer(csv_file)
            writer.writerows(list_of_vecs)
        shutil.rmtree(path_uci_indoor_unzip)  # clean up - remove unnecessary files

        df = pd.read_csv(path_uci_indoor_csv)
        base_datetime = to_datetime(df['1:Date'].iloc[0], df['2:Time'].iloc[0])

        def uci_datetime_converter(row):
            return datetime_to_daysElapsed(to_datetime(row['1:Date'], row['2:Time']), base_datetime)

        df['24:Day_Of_Week'] = df.apply(uci_datetime_converter, axis=1)
        with open(path_uci_indoor_csv, 'r') as csvinput:
            with open(path_uci_indoor_cleaned, 'w') as csvoutput:
                writer = csv.writer(csvoutput, lineterminator='\n')
                reader = csv.reader(csvinput)
                r = 0
                appended_csv = [next(reader) + ['25:Days_Elapsed']]
                for row in reader:
                    row.append(df['24:Day_Of_Week'].iloc[r])
                    appended_csv.append(row)
                    r += 1
                writer.writerows(appended_csv)
        os.remove(path_uci_indoor_csv)  # clean up - remove unnecessary files

    df = pd.read_csv(path_uci_indoor_cleaned)
    return df
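
# Example usage (illustrative): the cleaned frame keeps the original UCI
# columns and appends the elapsed-time column computed above.
#
#     df = uci_indoor(verbose=True)
#     print(df[['1:Date', '2:Time', '25:Days_Elapsed']].head())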
def sp500(verbose=True):
    """
    Description: Checks if S&P500 data exists, downloads if not.
    Args:
        verbose (boolean): Specifies if download progress should be printed
    Returns:
        Dataframe containing S&P500 data
    """
    tigercontrol_dir = get_tigercontrol_dir()
    url_sp500_xls = 'http://www.cboe.com/micro/buywrite/dailypricehistory.xls'
    path_sp500_xls = os.path.join(tigercontrol_dir, 'data/sp500_xls.xls')
    path_sp500_txt = os.path.join(tigercontrol_dir, 'data/sp500_col.txt')
    path_sp500_csv = os.path.join(tigercontrol_dir, 'data/sp500.csv')

    # check if files have been downloaded before, else download
    if not os.path.exists(path_sp500_csv):
        download(path_sp500_xls, url_sp500_xls, verbose)  # get files from online URL
        book = xlrd.open_workbook(path_sp500_xls)
        sh = book.sheet_by_index(0)
        sp500_col = open(path_sp500_txt, 'w')
        for r in range(5, 8197):
            date = datetime.datetime(*xlrd.xldate_as_tuple(sh.cell_value(r, 0), book.datemode))
            sp500_col.write(str(date) + "," + str(sh.cell(r, 3).value) + "\n")
        sp500_col.close()

        with open(path_sp500_txt) as f:  # clean downloaded data
            with open(path_sp500_csv, 'w') as out:
                csv_out = csv.writer(out)
                csv_out.writerow(['date', 'value'])
                for x in f.readlines():
                    date_val_list = x.strip().split(',')
                    date_val_list[0] = (date_val_list[0].split(' '))[0]
                    csv_out.writerow(date_val_list)
        os.remove(path_sp500_xls)  # clean up - remove unnecessary files
        os.remove(path_sp500_txt)

    return pd.read_csv(path_sp500_csv)
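
# Example usage (illustrative): the resulting CSV has the two columns written
# above, 'date' and 'value'.
#
#     df = sp500(verbose=False)
#     print(df.tail())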
def crypto():
    """
    Description: Checks if cryptocurrency data exists, downloads if not.
    Args:
        None
    Returns:
        Dataframe containing cryptocurrency data
    """
    tigercontrol_dir = get_tigercontrol_dir()
    path_crypto_csv = os.path.join(tigercontrol_dir, 'data/crypto.csv')

    if not os.path.exists(path_crypto_csv):
        df = pd.read_csv('https://query.data.world/s/43quzwdjeh2zmghpdcgvgkppo6bvg7')
        dict_of_currency_dfs = {k: v for k, v in df.groupby('Currency')}
        bdf = dict_of_currency_dfs['bitcoin']  # keep only the bitcoin series
        bdf.to_csv(path_crypto_csv)

    return pd.read_csv(path_crypto_csv)
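
# Example usage (illustrative): load the cached bitcoin subset of the
# cryptocurrency data ('Currency' is the grouping column used above).
#
#     df = crypto()
#     print(df.columns.tolist())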
def enso(input_signals, include_month, output_signals, history, timeline):
    """
    Description: Transforms the ctrl_indices dataset into a format suitable for online learning.
    Args:
        input_signals (list of strings): signals used for prediction
        include_month (boolean): True if the month should be used as a feature, False otherwise
        output_signals (list of strings): signals we are trying to predict
        history (int): number of past observations used for prediction
        timeline (int/list of ints): the forecasting timeline(s)
    Returns:
        X (numpy.ndarray): Input Observations
        y (numpy.ndarray): Labels
    """
    ################################ GET DATA #################################
    tigercontrol_dir = get_tigercontrol_dir()
    datapath = os.path.join(tigercontrol_dir, 'data/enso.csv')
    signals_pd = pd.read_csv(datapath)
    signal_length = signals_pd['nino34'].values.shape[0]

    # make timeline into np.array
    if type(timeline) is int:
        timeline = [timeline]
    timeline = np.array(timeline)

    ################################ GET ONI ###################################
    # 0. Get nino34 signal for ONI
    nino34 = signals_pd['nino34'].values

    # 1. Get climatology
    clm = np.zeros(12)
    for month in range(12):
        section = [12 * i + month for i in range(signal_length // 12)]
        clm[month] = np.mean(nino34[section])

    # 2. Compute anomaly
    anm = np.array(nino34)
    for i in range(signal_length):
        anm[i] = nino34[i] - clm[i % 12]

    # 3. Get ONI
    oni = np.array(anm)
    m = 3
    for i in range(signal_length):
        oni[i] = np.mean(anm[max(0, (i - m + 1)):min((i + 1), signal_length)])

    ######################## PREPARE INPUT/OUTPUT SIGNALS ######################
    # input signals
    X_signals = np.zeros((signal_length, 0))
    for signal in input_signals:
        if signal == 'oni':
            new_signal = oni.reshape((signal_length, 1))
        else:
            new_signal = (signals_pd[signal].values).reshape((signal_length, 1))
        X_signals = np.append(X_signals, new_signal, axis=1)

    if include_month:
        new_signal = (np.arange(signal_length) % 12).reshape((signal_length, 1))
        X_signals = np.append(X_signals, new_signal, axis=1)

    # output signals
    y_signals = np.zeros((signal_length, 0))
    for signal in output_signals:
        if signal == 'oni':
            new_signal = oni.reshape((signal_length, 1))
        else:
            new_signal = (signals_pd[signal].values).reshape((signal_length, 1))
        y_signals = np.append(y_signals, new_signal, axis=1)

    ######################### CONVERT SIGNALS TO (X, y) ########################
    effective_length = signal_length - history - np.max(timeline)

    # Observations
    X = np.ndarray((effective_length, history, np.array(input_signals).shape[0] + include_month))
    for i in range(effective_length):
        X[i, 0:history, :] = X_signals[i:(i + history)]

    # if we use only one observation
    if history == 1:
        X = X.reshape((effective_length, np.array(input_signals).shape[0] + include_month))

    # Labels
    y = np.ndarray((effective_length, timeline.shape[0], np.array(output_signals).shape[0]))
    n_t = 0
    for t in timeline:
        for i in range(effective_length):
            y[i, n_t, :] = y_signals[i + history + t - 1]
        n_t += 1

    # if we predict only one timeline
    if timeline.shape[0] == 1:
        y = y.reshape((effective_length, np.array(output_signals).shape[0]))

    return (X, y)
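
# Example usage (illustrative sketch; assumes 'data/enso.csv' is present in the
# tigercontrol data directory and contains a 'nino34' column): predict the ONI
# one and three months ahead from the last 6 months of ONI plus the calendar month.
#
#     X, y = enso(input_signals=['oni'], include_month=True,
#                 output_signals=['oni'], history=6, timeline=[1, 3])
#     print(X.shape)   # (T, 6, 2): T samples, 6 lags, 2 features (oni + month)
#     print(y.shape)   # (T, 2, 1): 2 forecast horizons, 1 output signal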