Source code for pandastable.data

#!/usr/bin/env python
"""
    Module implementing the Data class that manages data for
    it's associated PandasTable.

    Created Jan 2014
    Copyright (C) Damien Farrell

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 2
    of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

from types import *
import operator
import os, string, types, copy
import pickle
import numpy as np
import pandas as pd
from . import util

[docs]class TableModel(object): """A data model for the Table class that uses pandas Args: dataframe: pandas dataframe rows: number of rows if empty table columns: number of columns if empty table """ keywords = {'colors':'colors'} def __init__(self, dataframe=None, rows=20, columns=5): """Constructor for table model. """ self.initialiseFields() self.setup(dataframe, rows, columns) return
[docs] def setup(self, dataframe, rows=20, columns=5): """Create table model""" if not dataframe is None: self.df = dataframe else: colnames = list(string.ascii_lowercase[:columns]) self.df = pd.DataFrame(index=range(rows),columns=colnames) #self.df = self.getSampleData() #self.reclist = self.df.index # not needed now? return
[docs] @classmethod def getSampleData(self, rows=400, cols=5, n=2): """Generate sample data Args: rows: no. of rows cols: columns n: length of column names """ import random s = string.ascii_lowercase def genstr(n=2): return ''.join(random.choice(s) for i in range(n)) maxrows = 5e6 if rows>maxrows: rows=maxrows if cols>1e5: cols=int(1e5) n=2 if cols>100: n=3 colnames = [genstr(n) for i in range(cols)] coldata = [np.random.normal(x,1,rows) for x in np.random.normal(5,3,cols)] n = np.array(coldata).T df = pd.DataFrame(n, columns=colnames) col1 = colnames[0] col2 = colnames[1] df[col2] = df[col1]*np.random.normal(.8, .2, len(df)) df = np.round(df, 3) cats = ['low','medium','high','very high'] df['label'] = pd.cut(df[col1], bins=4, labels=cats).astype(str) #df['label'] = df.label.cat.as_ordered() #don't add date if rows too large if rows<2e6: df['date'] = pd.date_range('1/1/2016', periods=rows, freq='H') return df
[docs] @classmethod def getIrisData(self): """Get iris dataset""" path = os.path.dirname(__file__) cols = ['sepal length','sepal width','petal length','petal width','class'] df = pd.read_csv(os.path.join(path,'datasets','iris.data'),names=cols) return df
[docs] @classmethod def getStackedData(self): """Get a dataframe to pivot test""" import pandas.util.testing as tm; tm.N = 4 frame = tm.makeTimeDataFrame() N, K = frame.shape data = {'value' : frame.values.ravel('F'), 'variable' : np.asarray(frame.columns).repeat(N), 'date' : np.tile(np.asarray(frame.index), K)} return pd.DataFrame(data, columns=['date', 'variable', 'value'])
[docs] def initialiseFields(self): """Create meta data fields""" self.meta = {} #self.columnwidths = {} #used to store col widths return
[docs] def save(self, filename): """Save dataframe""" ftype = os.path.splitext(filename)[1] if ftype == '.pickle': self.df.to_pickle(filename) elif ftype == '.xls': self.df.to_excel(filename) elif ftype == '.csv': self.df.to_csv(filename) #elif ftype == '.html': # self.df.to_html(filename) return
[docs] def load(self, filename, filetype=None): """Load file, if no filetype given assume it's pickle format""" if filetype == '.mpk': self.df = pd.read_msgpack(filename) else: self.df = pd.read_pickle(filename) #print (len(self.df)) return
[docs] def getlongestEntry(self, colindex, n=500): """Get the longest string in the column for determining width. Just uses the first n rows for speed""" df = self.df col = df.columns[colindex] try: if df.dtypes[col] == 'float64': c = df[col][:n].round(3) else: c = df[col][:n] except: return 1 longest = c.astype('object').astype('str').str.len().max() if np.isnan(longest): return 1 return longest
[docs] def getRecordAtRow(self, rowindex): """Get the entire record at the specifed row""" record = self.df.iloc[rowindex] return record
[docs] def moveColumn(self, oldindex, newindex): """Changes the order of columns""" df = self.df cols = list(df.columns) name = cols[oldindex] del cols[oldindex] cols.insert(newindex, name) self.df = df[cols] return
[docs] def autoAddRows(self, num): """Add n rows to end of dataframe. Will create rows with index starting from highest previous row count""" df = self.df if len(df) == 0: self.df = pd.DataFrame(pd.Series(range(num))) #print (df) return try: ind = self.df.index.max()+1 except: ind = len(df)+1 new = pd.DataFrame(np.nan, index=range(ind,ind+num), columns=df.columns) self.df = pd.concat([df, new]) return
[docs] def insertRow(self, row): """Inserts a row at the required index by append/concat""" df = self.df a, b = df[:row], df[row:] idx = len(df)+1 new = pd.DataFrame(np.nan,index=[idx],columns=df.columns) a = pd.concat([a,new]) self.df = pd.concat([a,b]) return idx
[docs] def deleteRow(self, row, unique=True): """Delete a row""" self.deleteRows([row], unique) return
[docs] def deleteRows(self, rowlist=None, unique=True): """Delete multiple or all rows""" df = self.df if unique == True: rows = list(set(range(len(df))) - set(rowlist)) self.df = df.iloc[rows] else: df.drop(df.index[rowlist],inplace=True) return
[docs] def addColumn(self, colname=None, dtype=None, data=None): """Add a column""" if data is None: data = pd.Series(dtype=dtype) self.df[colname] = data return
[docs] def deleteColumn(self, colindex): """delete a column""" df = self.df colname = df.columns[colindex] df.drop([colname], axis=1, inplace=True) return
[docs] def deleteColumns(self, cols=None): """Remove all cols or list provided""" df = self.df colnames = df.columns[cols] df.drop(colnames, axis=1, inplace=True) return
[docs] def deleteCells(self, rows, cols): self.df.iloc[rows,cols] = np.nan return
[docs] def resetIndex(self, drop=False): """Reset index behaviour""" df = self.df df.reset_index(drop=drop,inplace=True) return
[docs] def setindex(self, colindex): """Index setting behaviour""" df = self.df colnames = list(df.columns[colindex]) indnames = df.index.names if indnames[0] != None: df.reset_index(inplace=True) df.set_index(colnames, inplace=True) return
[docs] def copyIndex(self): """Copy index to a column""" df = self.df name = df.index.name if name == None: name='index' df[name] = df.index#.astype('object') return
[docs] def groupby(self, cols): """Group by cols""" df = self.df colnames = df.columns[cols] grps = df.groupby(colnames) return grps
[docs] def getColumnType(self, columnIndex): """Get the column type""" coltype = self.df.dtypes[columnIndex] return coltype
[docs] def getColumnCount(self): """Returns the number of columns in the data model""" return len(self.df.columns)
[docs] def getColumnName(self, columnIndex): """Returns the name of the given column by columnIndex""" try: return str(self.df.columns[columnIndex]) except: return self.df.columns[columnIndex].encode('ascii', 'ignore')
[docs] def getRowCount(self): """Returns the number of rows in the table model.""" return len(self.df)
[docs] def getValueAt(self, row, col): """Returns the cell value at location specified by columnIndex and rowIndex.""" df = self.df value = self.df.iloc[row,col] if type(value) is float and np.isnan(value): return '' return value
[docs] def setValueAt(self, value, row, col, df=None): """Change dataframe according to row/col numbers. You can also pass an arbitrary dataframe here.""" if df is None: df = self.df rowindex = df.iloc[row].name colindex = df.columns[col] #print (df.loc[rowindex,colindex]) if value == '': value = np.nan dtype = self.df.dtypes[col] #try to cast to column type try: if dtype == 'float64': value = float(value) elif dtype == 'int': value = int(value) elif dtype == 'datetime64[ns]': value = pd.to_datetime(value) except Exception as e: print (e) if df.index.is_unique is True: df.loc[rowindex,colindex] = value else: #we cannot use index if not unique df.iloc[row,col] = value return
[docs] def transpose(self): """Transpose dataframe""" df = self.df rows = df.index df = df.transpose() df.reset_index() if util.check_multiindex(df.columns) != 1: try: df.columns = df.columns.astype(str) except: pass try: self.df = df.infer_objects() except: self.df = df.convert_objects() #self.columnwidths = {} return
[docs] def query(self): return
[docs] def filterby(self): import filtering funcs = filtering.operatornames floatops = ['=','>','<'] func = funcs[op] return
def __repr__(self): return 'Table Model with %s rows' %len(self.df)