"""
"mini pandas.DataFrame"
Table class with Excel + CSV I/O, easy access to columns, HTML output, and much more.
"""
__author__ = "Philippe Guglielmetti"
__copyright__ = "Copyright 2013, Philippe Guglielmetti"
__credits__ = []
__license__ = "LGPL"
import logging
import csv
import itertools
import codecs
import json
import collections
from datetime import datetime, date, time, timedelta
from Goulib import itertools2, markup, datetime2
try:
from lxml import etree as ElementTree
defaultparser = ElementTree.HTMLParser
Element = ElementTree._Element
except ImportError: #TODO: find why it happens in LiClipse
from xml.etree import ElementTree
Element = ElementTree.Element
[docs]def attr(args):
res = ''
for key, val in args.items():
k = "class" if key == "_class" else key
res += ' %s="%s"' % (k, val)
return res
[docs]class Cell(object):
"""Table cell with HTML attributes"""
[docs] def __init__(self, data=None, align=None, fmt=None, tag=None, style={}):
"""
:param data: cell value(s) of any type
:param align: string for HTML align attribute
:param fmt: format string applied applied to data
:param tag: called to build each cell. defaults to 'td'
:param style: dict or string for HTML style attribute
"""
if isinstance(data, Element):
tag = data.tag
assert(tag in ('td', 'th'))
def _recurse(data):
"grab all possible text from the cell content"
if data is None:
return ''
s = ''.join(_recurse(x) for x in data.getchildren())
if data.text:
s = data.text+s
if data.tail:
s = s+data.tail
return s
data = Cell.read(_recurse(data))
if isinstance(data, str):
data = data.lstrip().rstrip() # remove spaces, but also trailing \r\n
self.data = data
self.align = align
self.fmt = fmt
self.tag = tag if tag else 'td'
if not isinstance(style, dict):
style = markup.style_str2dict(style)
self.style = style
[docs] def __str__(self):
return str(self.data)
def _repr_html_(self):
return self.html()
[docs] @staticmethod
def read(x):
"""interprets x as int, float, string or None"""
try:
x = float(x) # works for x in unicode
xi = int(x) # does not work if x is floating point in unicode (?)
if xi == x:
x = xi
except:
if x == '':
x = None
return x
[docs] def html(self, **kwargs):
""":return: string HTML formatted cell:
* if data is int, default align="right"
* if data is float, default align="right" and fmt='%0.2f'
* if data is :class:`~datetime.timedelta`, align = "right" and formatting is done by :func:`datetime2.strftimedelta`
"""
args = {}
args.update(kwargs) # copy needed to avoid side effects
v = self.data
f = self.fmt
style = args.get('style', {})
if isinstance(style, str):
style = markup.style_str2dict(style)
# create style dict by merging default Cell style + parameters
# http://stackoverflow.com/questions/9819602/union-of-dict-objects-in-python
style = dict(self.style, **style)
if hasattr(v, '_repr_html_'):
try:
v = v._repr_html_()
except Exception as e:
v = 'ERROR : %s _repr_html_ failed : %s' % (v, e)
elif 'text-align' not in style: # HTML 4 and before
a = args.pop('align', self.align)
if isinstance(v, int):
if not a:
a = "right"
elif isinstance(v, float):
if not a:
a = "right"
if not f:
f = '%0.2f'
v = f % v
f = None # don't reformat below
elif isinstance(v, date):
if not a:
a = "right"
if not f:
f = '%Y-%m-%d'
v = v.strftime(f)
f = None # don't reformat below
elif isinstance(v, timedelta):
if not a:
a = "right"
if not f:
f = '%H:%M:%S'
v = datetime2.strftimedelta(v, f)
f = None # don't reformat below
if a:
style['text-align'] = a
if v is None or v == '':
v = " " # for IE8
else:
v = f % v if f else str(v)
if v[0] == '$' and v[-1] == '$': # LaTeX formula : avoid line wrap
v = '$'+v+'$'
if style:
args['style'] = style
return markup.tag(self.tag, v, **args)
[docs]class Row(object):
"""Table row with HTML attributes"""
[docs] def __init__(self, data, align=None, fmt=None, tag=None, style={}):
"""
:param data: (list of) cell value(s) of any type
:param align: (list of) string for HTML align attribute
:param fmt: (list of) format string applied applied to data
:param tag: (list of) tags called to build each cell. defaults to 'td'
:param style: (list of) dict or string for HTML style attribute
"""
if not itertools2.isiterable(data):
data = [data]
data = list(data) # to make it mutable
# ensure params have the same length as data
if not isinstance(style, list):
style = [style]
style = style+[None]*(len(data)-len(style))
if not isinstance(align, list):
align = [align]
align = align+[None]*(len(data)-len(align))
if not isinstance(fmt, list):
fmt = [fmt]
fmt = fmt+[None]*(len(data)-len(fmt))
if not tag:
tag = 'td'
if not isinstance(tag, list):
tag = [tag]*(len(data)) # make a full row, in case it's a 'th'
# fill the row with None, which will be 'td's
tag = tag+[None]*(len(data)-len(fmt))
for i, cell in enumerate(data):
if not isinstance(cell, Cell):
cell = Cell(cell, align[i], fmt[i], tag[i], style[i])
else:
pass # ignore attribs for now
data[i] = cell
self.data = data
[docs] def __str__(self):
return str(self.data)
def _repr_html_(self):
return self.html()
[docs] def html(self, cell_args={}, **kwargs):
"""return in HTML format"""
res = ''
for cell in self.data:
res += cell.html(**cell_args)
return markup.tag('tr', res, **kwargs)
[docs]class Table(list):
"""Table class with CSV I/O, easy access to columns, HTML output"""
[docs] def __init__(self, data=[], **kwargs):
"""inits a table, optionally by reading a Excel, csv or html file
:param data: list of list of cells, or string as filename
:param titles: optional list of strings used as column id
:param footer: optional list of functions used as column reducers
"""
try:
self.titles = data.titles
except AttributeError:
self.titles = kwargs.pop('titles', [])
try:
self.footer = data.footer
except AttributeError:
self.footer = kwargs.pop('footer', [])
filename = None
if isinstance(data, str):
filename = data
data = []
else: # ensure data is 2D and mutable
if isinstance(data, dict):
data = data.values()
for row in data:
if not itertools2.isiterable(row): # build a column
row = [row]
self.append(row)
if filename:
self.load(filename, **kwargs)
[docs] def __repr__(self):
""":return: repr string of titles+5 first lines"""
return '%s(len=%d,titles=%s,data=%s)' % (self.__class__, len(self), self.titles, self[:5])
[docs] def __str__(self):
""":return: string of full tables with linefeeds"""
res = ''
if self.titles:
res += str(self.titles)+'\n'
for line in self:
res += str(line)+'\n'
return res
def _repr_html_(self):
return self.html()
[docs] def html(self, head=None, foot=None, **kwargs):
"""HTML representation of table
:param head: optional column headers, .titles by default
:param foot: optional column footers, .footer by default
:param style: style for the table
:param colstyle: list of dict of style attributes for each column
:param kwargs: optional parameters passed along to tag('table'...
except:
* start=optional start row
* stop=optional end row
used to display a subset of lines. in this case rows with '...' cells
are displayed before and/or after the lines
:return: string HTML representation of table
"""
def TR(data, align=None, fmt=None, tag=None, style={}):
res = ''
row = Row(data=data, align=align, fmt=fmt, tag=tag, style=style)
res += row.html()+'\n'
return res
def THEAD(data, fmt=None, style={}):
res = "<thead>\n"
res += TR(data=data, fmt=fmt, tag='th', style=style)
res += "</thead>\n"
return res
def TFOOT(data, fmt=None, style={}):
res = "<tfoot>\n"
res += TR(data=data, fmt=fmt, tag='th', style=style)
res += "</tfoot>\n"
return res
res = ''
colstyle = kwargs.pop('colstyle', None)
if head is None:
head = self.titles
if head:
res += THEAD(head)
start = kwargs.pop('start', 0)
stop = kwargs.pop('stop', len(self))
if start != 0:
res += TR(['...']*self.ncols(), style=colstyle)
for row in self[start:stop]:
res += TR(row, style=colstyle)
if stop != -1 and stop < len(self):
res += TR(['...']*self.ncols(), style=colstyle)
if foot is None:
foot = self.footer
if foot:
res += TFOOT(foot)
# kwargs['style'] = kwargs.pop('style', {'table-layout': 'fixed'}) # to prevent wrapping
return markup.tag('table', res, **kwargs)+'\n'
[docs] def load(self, filename, **kwargs):
if self.titles: # explicitly set
l = kwargs.setdefault('titles_line', 0)
kwargs.setdefault('data_line', l+1)
else: # read titles from the file
l = kwargs.setdefault('titles_line', 1)
kwargs.setdefault('data_line', l+1)
ext = filename.split('.')[-1].lower()
if ext in ('xls', 'xlsx'):
self.read_xls(filename, **kwargs)
elif ext in ('htm', 'html'):
self.read_html(filename, **kwargs)
elif ext in ('json'):
self.read_json(filename, **kwargs)
else: # try ...
self.read_csv(filename, **kwargs)
return self # to allow chaining
[docs] def read_element(self, element, **kwargs):
"""read table from a DOM element.
:Warning: drops all formatting
"""
titles_line = kwargs.pop('titles_line', 1)-1
data_line = kwargs.pop('data_line', 2)-1
line = 0
head = element.find('thead')
if head is not None:
for row in head.findall('tr'):
if line == titles_line:
self.titles = [cell.data for cell in Row(row).data]
line = line+1
body = element.find('tbody')
if body is None:
body = element
for row in body.findall('tr'):
data = [cell.data for cell in Row(row).data]
if not data:
continue # skip empty lines
if not self.titles and line == titles_line:
self.titles = data
elif line >= data_line:
self.append(data)
line = line+1
return self
[docs] def read_html(self, filename, **kwargs):
"""read first table in HTML file
"""
parser = kwargs.get('parser', defaultparser)
element = ElementTree.parse(filename, parser()).getroot()
try:
if element.tag != 'table': # file contains table as topmost tag
element = element.find('.//table') # find first table
except AttributeError:
pass
if element is None:
raise LookupError('no table found in file')
self.read_element(element, **kwargs)
return self
[docs] def read_json(self, filename, **kwargs):
"""appends a json file made of lines dictionaries"""
with open(filename, 'r') as file:
for row in json.load(file):
self.append(row)
return self
[docs] def read_xls(self, filename, **kwargs):
"""appends an Excel table"""
titles_line = kwargs.pop('titles_line', 1)-1
data_line = kwargs.pop('data_line', 2)-1
import xlrd
wb = xlrd.open_workbook(filename)
sheet = kwargs.get('sheet', 0)
if isinstance(sheet, str):
s = wb.sheet_by_name(sheet)
else:
s = wb.sheet_by_index(sheet)
for i in range(s.nrows):
line = [Cell.read(s.cell(i, j).value) for j in range(s.ncols)]
if i == titles_line:
self.titles = line
elif i >= data_line:
self.append(line)
return self
[docs] def read_csv(self, filename, **kwargs):
"""appends a .csv or similar file to the table"""
titles_line = kwargs.pop('titles_line', 1)-1
data_line = kwargs.pop('data_line', 2)-1
dialect = kwargs.setdefault('dialect', csv.excel)
delimiter = kwargs.setdefault('delimiter', ';')
# must be iso-8859-15 in some cases
encoding = kwargs.pop('encoding', 'utf-8')
errors = kwargs.pop('errors', 'strict')
def reader(): # version for Python 3
with open(filename, 'rt', errors=errors, encoding=encoding) as f:
csv_reader = csv.reader(f, **kwargs)
for row in csv_reader:
yield row
for i, row in enumerate(reader()):
if i == titles_line: # titles can have no left/right spaces
self.titles = [Cell.read(x) for x in row]
elif i >= data_line:
line = [Cell.read(x) for x in row]
if line != [None]: # strange last line sometimes ...
self.append(line)
return self
[docs] def save(self, filename, **kwargs):
ext = filename.split('.')[-1].lower()
if ext in ('xls', 'xlsx'):
self.write_xlsx(filename, **kwargs)
elif ext in ('htm', 'html'):
with open(filename, 'w') as file:
file.write(self.html(**kwargs))
elif ext in ('json'):
with open(filename, 'w') as file:
file.write(self.json(**kwargs))
else: # try ...
self.write_csv(filename, **kwargs)
return self # to allow chaining
[docs] def write_xlsx(self, filename, **kwargs):
import xlsxwriter
workbook = xlsxwriter.Workbook(filename)
worksheet = workbook.add_worksheet()
df = workbook.add_format({'num_format': 'dd/mm/yyyy'})
tf = workbook.add_format({'num_format': 'hh:mm:ss'})
dtf = workbook.add_format({'num_format': 'dd/mm/yyyy hh:mm:ss'})
def writerow(i, line):
for j, s in enumerate(line):
if isinstance(s, datetime):
worksheet.write_datetime(i, j, s, dtf)
elif isinstance(s, date):
worksheet.write_datetime(i, j, s, df)
elif isinstance(s, (time, timedelta)):
worksheet.write_datetime(i, j, s, tf)
else:
worksheet.write(i, j, s)
writerow(0, self.titles)
for i, row in enumerate(self):
writerow(i+1, row)
workbook.close()
return self
[docs] def json(self, **kwargs):
"""
:return: string JSON representation of table
"""
def json_serial(obj):
"""JSON serializer for objects not serializable by default json code"""
if hasattr(obj, 'isoformat'):
return obj.isoformat()
try:
return str(obj)
except Exception:
pass
raise TypeError("Type %s not serializable" % (type(obj)))
array = [self.rowasdict(i) for i in range(len(self))]
kwargs.setdefault('default', json_serial)
return json.dumps(array, **kwargs)
[docs] def write_csv(self, filename, **kwargs):
""" write the table in Excel csv format, optionally transposed"""
dialect = kwargs.get('dialect', 'excel')
delimiter = kwargs.get('delimiter', ';')
encoding = kwargs.pop('encoding', 'utf-8') # was iso-8859-15 earlier
empty = ''.encode(encoding)
f = open(filename, 'w', newline='', encoding=encoding)
def _encode(line):
res = []
for s in line:
res.append(s)
return res
writer = csv.writer(f, dialect=dialect, delimiter=delimiter)
if self.titles:
s = _encode(self.titles)
writer.writerow(s)
for line in self:
s = _encode(line)
writer.writerow(s)
f.close()
return self
[docs] def __eq__(self, other):
"""compare 2 Tables contents, mainly for tests"""
if self.titles != other.titles:
return False
if len(self) != len(other):
return False
for i in range(len(self)):
if self[i] != other[i]:
return False
return True
[docs] def ncols(self):
"""
:return: number of columns, ignoring title
"""
return max(map(len, self))
[docs] def find_col(self, title):
"""finds a column from a part of the title"""
title = title.lower()
for i, c in enumerate(self.titles):
if c.lower().find(title) >= 0:
return i
return None
def _i(self, column):
'''column index'''
if isinstance(column, int):
return column
try:
return self.titles.index(column)
except ValueError:
return None
[docs] def icol(self, column):
"""iterates a column"""
i = self._i(column)
for row in self:
try:
yield row[i]
except:
yield None
[docs] def col(self, column, title=False):
i = self._i(column)
res = [x for x in self.icol(i)]
if title:
res = [self.titles[i]]+res
return res
[docs] def cols(self, title=False):
"""iterator through columns"""
for i in range(self.ncols()):
yield self.col(i, title)
[docs] def transpose(self, titles_column=0):
"""transpose table
:param: titles_column
:return: Table where rows are made from self's columns and vice-versa
"""
res = Table()
for i, row in enumerate(self.cols(self.titles)):
if i == titles_column:
res.titles = row
else:
res.append(row)
return res
[docs] def index(self, value, column=0):
"""
:return: int row number of first line where column contains value
"""
for i, v in enumerate(self.icol(column)):
if v == value:
return i
return None
[docs] def __getitem__(self, n):
try:
c = self._i(n[1])
except TypeError:
return super(Table, self).__getitem__(n)
else:
rows = super(Table, self).__getitem__(n[0])
if not isinstance(n[0], slice):
return rows[c]
return [row[c] for row in rows]
[docs] def get(self, row, col):
return self[row, col]
[docs] def set(self, row, col, value):
col = self._i(col)
if row >= len(self):
self.extend([list()]*(1+row-len(self)))
if col >= len(self[row]):
self[row].extend([None]*(1+col-len(self[row])))
self[row][col] = value
[docs] def setcol(self, col, value, i=0):
"""set column values
:param col: int or string column index
:param value: single value assigned to whole column or iterable assigned to each cell
:param i: optional int : index of first row to assign
"""
j = self._i(col)
if itertools2.isiterable(value):
for v in value:
self.set(i, j, v)
i += 1
else:
for i in range(i, len(self)):
self.set(i, j, value)
[docs] def append(self, line):
''' appends a line to table
:param line: can be either:
* a list
* a dict or column names:values
'''
if isinstance(line, dict):
r = len(self) # row number
for k, v in line.items():
i = self._i(k)
if i is None: # column doesn't exist:
i = len(self.titles)
self.titles.append(k)
self.set(r, i, v)
else:
list.append(self, list(line))
return self
[docs] def addcol(self, title, val=None, i=0):
'''add column to the right'''
col = len(self.titles)
self.titles.append(title)
if not itertools2.isiterable(val):
val = [val]*(len(self)-i)
for v in val:
self.set(i, col, v)
i += 1
return self
[docs] def sort(self, by, reverse=False):
'''sort by column'''
i = self._i(by)
if isinstance(i, int):
list.sort(self, key=lambda x: x[i], reverse=reverse)
else:
list.sort(i, reverse=reverse)
[docs] def rowasdict(self, i):
''' returns a line as a dict '''
return collections.OrderedDict(zip(self.titles, self[i]))
[docs] def asdict(self):
for i in range(len(self)):
yield self.rowasdict(i)
[docs] def groupby_gen(self, by, sort=True, removecol=True):
"""generates subtables
"""
i = self._i(by)
t = self.titles
if removecol:
t = t[:i]+t[i+1:]
if sort:
self.sort(i)
else:
pass # groupby will group CONSECUTIVE lines with same i, so entries at bottom of table will replace the earlier entries in dict
for k, g in itertools.groupby(self, key=lambda x: x[i]):
if removecol:
r = Table(titles=t, data=[a[:i]+a[i+1:] for a in g])
else:
r = Table(titles=t, data=list(g))
yield k, r
[docs] def groupby(self, by, sort=True, removecol=True):
""" ordered dictionary of subtables
"""
return collections.OrderedDict(
(k, t) for (k, t) in self.groupby_gen(by, sort, removecol)
)
[docs] def hierarchy(self, by='Level',
# creates an object from a line
factory=lambda row: (row, []),
# creates a parend/child relation between x and y. raw is also available (for qty)
linkfct=lambda x, y, row: x[1].append(y)
):
'''builds a structure from a table containing a "level" column'''
res = []
i = self._i(by)
stack = []
for row in self:
obj = factory(row)
level = row[i]
if level == 1:
res.append(obj)
while level <= len(stack):
stack.pop()
if stack:
linkfct(stack[-1], obj, row)
stack.append(obj)
return res
[docs] def applyf(self, by, f, skiperrors=False):
""" apply a function to a column
:param by: column name of number
:param f: function of the form lambda cell:content
:param skiperrors: bool. if True, errors while running f are ignored
:return: bool True if ok, False if skiperrors==True and conversion failed
"""
res = True
i = self._i(by)
for row in self:
try:
x = row[i]
row[i] = f(x)
except Exception as e:
if not skiperrors:
# TODO: change message to ('could not applyf to %s'%x) for Py2+3
raise
res = False
return res
def _datetimeformat(self, by, fmt, function, skiperrors):
"""convert a column to a date, time or datetime
:param by: column name of number
:param fmt: string defining format, or list of formats to try one by one
:param function: function to call
:param skiperrors: bool. if True, conversion errors are ignored
:return: bool True if ok, False if skiperrors==True and conversion failed
"""
if isinstance(fmt, list):
for f in fmt:
res = self._datetimeformat(
by, f, function, True if f != fmt[-1] else skiperrors)
return res
return self.applyf(by, lambda x: function(x, fmt=fmt), skiperrors)
[docs] def to_datetime(self, by, fmt='%Y-%m-%d %H:%M:%S', skiperrors=False):
"""convert a column to datetime
"""
return self._datetimeformat(by, fmt, datetime2.datetimef, skiperrors)
[docs] def to_date(self, by, fmt='%Y-%m-%d', skiperrors=False):
"""convert a column to date
"""
return self._datetimeformat(by, fmt, datetime2.datef, skiperrors)
[docs] def to_time(self, by, fmt='%H:%M:%S', skiperrors=False):
"""convert a column to time
"""
return self._datetimeformat(by, fmt, datetime2.timef, skiperrors)
[docs] def to_timedelta(self, by, fmt=None, skiperrors=False):
"""convert a column to time
"""
return self._datetimeformat(by, fmt, datetime2.timedeltaf, skiperrors)
[docs] def total(self, funcs):
"""build a footer row by appling funcs to all columns
"""
funcs = funcs+[None]*(len(self.titles)-len(funcs))
self.footer = []
for i, f in enumerate(funcs):
try:
self.footer.append(f(self.col(i)))
except:
self.footer.append(f)
return self.footer
[docs] def remove_lines_where(self, f, value=(None, 0, '')):
"""
:param f: function of the form lambda line:bool returning True if line should be removed
:return: int number of lines removed
"""
i = self._i(f)
if i is not None:
f = (lambda x: x[i] in value)
return len(itertools2.removef(self, f))
if __name__ == "__main__":
t = Table([[1, 2], [3, 'a 4']], style='width:100%;')
print(t.html())