#!/usr/bin/env python
"""
Module implementing the Data class that manages data for
it's associated PandasTable.
Created Jan 2014
Copyright (C) Damien Farrell
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from types import *
import operator
import os, string, types, copy
import pickle
import numpy as np
import pandas as pd
from . import util
[docs]class TableModel(object):
"""A data model for the Table class that uses pandas
Args:
dataframe: pandas dataframe
rows: number of rows if empty table
columns: number of columns if empty table
"""
keywords = {'colors':'colors'}
def __init__(self, dataframe=None, rows=20, columns=5):
"""Constructor for table model. """
self.initialiseFields()
self.setup(dataframe, rows, columns)
return
[docs] def setup(self, dataframe, rows=20, columns=5):
"""Create table model"""
if not dataframe is None:
self.df = dataframe
else:
colnames = list(string.ascii_lowercase[:columns])
self.df = pd.DataFrame(index=range(rows),columns=colnames)
#self.df = self.getSampleData()
#self.reclist = self.df.index # not needed now?
return
[docs] @classmethod
def getSampleData(self, rows=400, cols=5, n=2):
"""Generate sample data
Args:
rows: no. of rows
cols: columns
n: length of column names
"""
import random
s = string.ascii_lowercase
def genstr(n=2):
return ''.join(random.choice(s) for i in range(n))
maxrows = 5e6
if rows>maxrows:
rows=maxrows
if cols>1e5:
cols=int(1e5)
n=2
if cols>100: n=3
colnames = [genstr(n) for i in range(cols)]
coldata = [np.random.normal(x,1,rows) for x in np.random.normal(5,3,cols)]
n = np.array(coldata).T
df = pd.DataFrame(n, columns=colnames)
col1 = colnames[0]
col2 = colnames[1]
df[col2] = df[col1]*np.random.normal(.8, .2, len(df))
df = np.round(df, 3)
cats = ['low','medium','high','very high']
df['label'] = pd.cut(df[col1], bins=4, labels=cats).astype(str)
#df['label'] = df.label.cat.as_ordered()
#don't add date if rows too large
if rows<2e6:
df['date'] = pd.date_range('1/1/2016', periods=rows, freq='H')
return df
[docs] @classmethod
def getIrisData(self):
"""Get iris dataset"""
path = os.path.dirname(__file__)
cols = ['sepal length','sepal width','petal length','petal width','class']
df = pd.read_csv(os.path.join(path,'datasets','iris.data'),names=cols)
return df
[docs] @classmethod
def getStackedData(self):
"""Get a dataframe to pivot test"""
import pandas.util.testing as tm; tm.N = 4
frame = tm.makeTimeDataFrame()
N, K = frame.shape
data = {'value' : frame.values.ravel('F'),
'variable' : np.asarray(frame.columns).repeat(N),
'date' : np.tile(np.asarray(frame.index), K)}
return pd.DataFrame(data, columns=['date', 'variable', 'value'])
[docs] def initialiseFields(self):
"""Create meta data fields"""
self.meta = {}
#self.columnwidths = {} #used to store col widths
return
[docs] def save(self, filename):
"""Save dataframe"""
ftype = os.path.splitext(filename)[1]
if ftype == '.pickle':
self.df.to_pickle(filename)
elif ftype == '.xls':
self.df.to_excel(filename)
elif ftype == '.csv':
self.df.to_csv(filename)
#elif ftype == '.html':
# self.df.to_html(filename)
return
[docs] def load(self, filename, filetype=None):
"""Load file, if no filetype given assume it's pickle format"""
if filetype == '.mpk':
self.df = pd.read_msgpack(filename)
else:
self.df = pd.read_pickle(filename)
#print (len(self.df))
return
[docs] def getlongestEntry(self, colindex, n=500):
"""Get the longest string in the column for determining width. Just uses the first
n rows for speed"""
df = self.df
col = df.columns[colindex]
try:
if df.dtypes[col] == 'float64':
c = df[col][:n].round(3)
else:
c = df[col][:n]
except:
return 1
longest = c.astype('object').astype('str').str.len().max()
if np.isnan(longest):
return 1
return longest
[docs] def getRecordAtRow(self, rowindex):
"""Get the entire record at the specifed row"""
record = self.df.iloc[rowindex]
return record
[docs] def moveColumn(self, oldindex, newindex):
"""Changes the order of columns"""
df = self.df
cols = list(df.columns)
name = cols[oldindex]
del cols[oldindex]
cols.insert(newindex, name)
self.df = df[cols]
return
[docs] def autoAddRows(self, num):
"""Add n rows to end of dataframe. Will create rows with index starting
from highest previous row count"""
df = self.df
if len(df) == 0:
self.df = pd.DataFrame(pd.Series(range(num)))
#print (df)
return
try:
ind = self.df.index.max()+1
except:
ind = len(df)+1
new = pd.DataFrame(np.nan, index=range(ind,ind+num), columns=df.columns)
self.df = pd.concat([df, new])
return
[docs] def insertRow(self, row):
"""Inserts a row at the required index by append/concat"""
df = self.df
a, b = df[:row], df[row:]
idx = len(df)+1
new = pd.DataFrame(np.nan,index=[idx],columns=df.columns)
a = pd.concat([a,new])
self.df = pd.concat([a,b])
return idx
[docs] def deleteRow(self, row, unique=True):
"""Delete a row"""
self.deleteRows([row], unique)
return
[docs] def deleteRows(self, rowlist=None, unique=True):
"""Delete multiple or all rows"""
df = self.df
if unique == True:
rows = list(set(range(len(df))) - set(rowlist))
self.df = df.iloc[rows]
else:
df.drop(df.index[rowlist],inplace=True)
return
[docs] def addColumn(self, colname=None, dtype=None, data=None):
"""Add a column"""
if data is None:
data = pd.Series(dtype=dtype)
self.df[colname] = data
return
[docs] def deleteColumn(self, colindex):
"""delete a column"""
df = self.df
colname = df.columns[colindex]
df.drop([colname], axis=1, inplace=True)
return
[docs] def deleteColumns(self, cols=None):
"""Remove all cols or list provided"""
df = self.df
colnames = df.columns[cols]
df.drop(colnames, axis=1, inplace=True)
return
[docs] def deleteCells(self, rows, cols):
self.df.iloc[rows,cols] = np.nan
return
[docs] def resetIndex(self, drop=False):
"""Reset index behaviour"""
df = self.df
df.reset_index(drop=drop,inplace=True)
return
[docs] def setindex(self, colindex):
"""Index setting behaviour"""
df = self.df
colnames = list(df.columns[colindex])
indnames = df.index.names
if indnames[0] != None:
df.reset_index(inplace=True)
df.set_index(colnames, inplace=True)
return
[docs] def copyIndex(self):
"""Copy index to a column"""
df = self.df
name = df.index.name
if name == None: name='index'
df[name] = df.index#.astype('object')
return
[docs] def groupby(self, cols):
"""Group by cols"""
df = self.df
colnames = df.columns[cols]
grps = df.groupby(colnames)
return grps
[docs] def getColumnType(self, columnIndex):
"""Get the column type"""
coltype = self.df.dtypes[columnIndex]
return coltype
[docs] def getColumnCount(self):
"""Returns the number of columns in the data model"""
return len(self.df.columns)
[docs] def getColumnName(self, columnIndex):
"""Returns the name of the given column by columnIndex"""
try:
return str(self.df.columns[columnIndex])
except:
return self.df.columns[columnIndex].encode('ascii', 'ignore')
[docs] def getRowCount(self):
"""Returns the number of rows in the table model."""
return len(self.df)
[docs] def getValueAt(self, row, col):
"""Returns the cell value at location specified
by columnIndex and rowIndex."""
df = self.df
value = self.df.iloc[row,col]
if type(value) is float and np.isnan(value):
return ''
return value
[docs] def setValueAt(self, value, row, col, df=None):
"""Change dataframe according to row/col numbers. You can
also pass an arbitrary dataframe here."""
if df is None:
df = self.df
rowindex = df.iloc[row].name
colindex = df.columns[col]
#print (df.loc[rowindex,colindex])
if value == '':
value = np.nan
dtype = self.df.dtypes[col]
#try to cast to column type
try:
if dtype == 'float64':
value = float(value)
elif dtype == 'int':
value = int(value)
elif dtype == 'datetime64[ns]':
value = pd.to_datetime(value)
except Exception as e:
print (e)
if df.index.is_unique is True:
df.loc[rowindex,colindex] = value
else:
#we cannot use index if not unique
df.iloc[row,col] = value
return
[docs] def transpose(self):
"""Transpose dataframe"""
df = self.df
rows = df.index
df = df.transpose()
df.reset_index()
if util.check_multiindex(df.columns) != 1:
try:
df.columns = df.columns.astype(str)
except:
pass
try:
self.df = df.infer_objects()
except:
self.df = df.convert_objects()
#self.columnwidths = {}
return
[docs] def query(self):
return
[docs] def filterby(self):
import filtering
funcs = filtering.operatornames
floatops = ['=','>','<']
func = funcs[op]
return
def __repr__(self):
return 'Table Model with %s rows' %len(self.df)