Source code for Goulib.table

"""
"mini pandas.DataFrame"
Table class with Excel + CSV I/O, easy access to columns, HTML output, and much more.
"""
__author__ = "Philippe Guglielmetti"
__copyright__ = "Copyright 2013, Philippe Guglielmetti"
__credits__ = []
__license__ = "LGPL"


import logging

import csv
import itertools
import codecs
import json
import collections

from datetime import datetime, date, time, timedelta

from Goulib import itertools2, markup, datetime2

try:
    from lxml import etree as ElementTree
    defaultparser = ElementTree.HTMLParser
    Element = ElementTree._Element
except ImportError: #TODO: find why it happens in LiClipse
    from xml.etree import ElementTree
    Element = ElementTree.Element
    



[docs]def attr(args):
    res = ''
    for key, val in args.items():
        k = "class" if key == "_class" else key
        res += ' %s="%s"' % (k, val)
    return res


[docs]class Cell(object):
    """Table cell with HTML attributes"""

[docs]    def __init__(self, data=None, align=None, fmt=None, tag=None, style={}):
        """
        :param data: cell value(s) of any type
        :param align: string for HTML align attribute
        :param fmt: format string applied applied to data
        :param tag: called to build each cell. defaults to 'td'
        :param style: dict or string for HTML style attribute
        """

        if isinstance(data, Element):
            tag = data.tag
            assert(tag in ('td', 'th'))

            def _recurse(data):
                "grab all possible text from the cell content"
                if data is None:
                    return ''
                s = ''.join(_recurse(x) for x in data.getchildren())
                if data.text:
                    s = data.text+s
                if data.tail:
                    s = s+data.tail
                return s

            data = Cell.read(_recurse(data))

        if isinstance(data, str):
            data = data.lstrip().rstrip()  # remove spaces, but also trailing \r\n

        self.data = data
        self.align = align
        self.fmt = fmt
        self.tag = tag if tag else 'td'
        if not isinstance(style, dict):
            style = markup.style_str2dict(style)
        self.style = style

[docs]    def __str__(self):
        return str(self.data)

    def _repr_html_(self):
        return self.html()

[docs]    @staticmethod
    def read(x):
        """interprets x as int, float, string or None"""
        try:
            x = float(x)  # works for x in unicode
            xi = int(x)  # does not work if x is floating point in unicode (?)
            if xi == x:
                x = xi
        except:
            if x == '':
                x = None
        return x

[docs]    def html(self, **kwargs):
        """:return: string HTML formatted cell:

        * if data is int, default align="right"
        * if data is float, default align="right" and fmt='%0.2f'
        * if data is :class:`~datetime.timedelta`, align = "right" and formatting is done by :func:`datetime2.strftimedelta`

        """
        args = {}
        args.update(kwargs)  # copy needed to avoid side effects

        v = self.data
        f = self.fmt

        style = args.get('style', {})
        if isinstance(style, str):
            style = markup.style_str2dict(style)

        # create style dict by merging default Cell style + parameters
        # http://stackoverflow.com/questions/9819602/union-of-dict-objects-in-python
        style = dict(self.style, **style)

        if hasattr(v, '_repr_html_'):
            try:
                v = v._repr_html_()
            except Exception as e:
                v = 'ERROR : %s _repr_html_ failed : %s' % (v, e)

        elif 'text-align' not in style:  # HTML 4 and before
            a = args.pop('align', self.align)
            if isinstance(v, int):
                if not a:
                    a = "right"
            elif isinstance(v, float):
                if not a:
                    a = "right"
                if not f:
                    f = '%0.2f'
                v = f % v
                f = None  # don't reformat below
            elif isinstance(v, date):
                if not a:
                    a = "right"
                if not f:
                    f = '%Y-%m-%d'
                v = v.strftime(f)
                f = None  # don't reformat below
            elif isinstance(v, timedelta):
                if not a:
                    a = "right"
                if not f:
                    f = '%H:%M:%S'
                v = datetime2.strftimedelta(v, f)
                f = None  # don't reformat below

            if a:
                style['text-align'] = a

        if v is None or v == '':
            v = "&nbsp;"  # for IE8
        else:
            v = f % v if f else str(v)

        if v[0] == '$' and v[-1] == '$':  # LaTeX formula : avoid line wrap
            v = '$'+v+'$'

        if style:
            args['style'] = style

        return markup.tag(self.tag, v, **args)


[docs]class Row(object):
    """Table row with HTML attributes"""

[docs]    def __init__(self, data, align=None, fmt=None, tag=None, style={}):
        """
        :param data: (list of) cell value(s) of any type
        :param align: (list of) string for HTML align attribute
        :param fmt: (list of) format string applied applied to data
        :param tag: (list of) tags called to build each cell. defaults to 'td'
        :param style: (list of) dict or string for HTML style attribute
        """

        if not itertools2.isiterable(data):
            data = [data]
        data = list(data)  # to make it mutable

        # ensure params have the same length as data

        if not isinstance(style, list):
            style = [style]
        style = style+[None]*(len(data)-len(style))

        if not isinstance(align, list):
            align = [align]
        align = align+[None]*(len(data)-len(align))

        if not isinstance(fmt, list):
            fmt = [fmt]
        fmt = fmt+[None]*(len(data)-len(fmt))

        if not tag:
            tag = 'td'
        if not isinstance(tag, list):
            tag = [tag]*(len(data))  # make a full row, in case it's a 'th'
        # fill the row with None, which will be 'td's
        tag = tag+[None]*(len(data)-len(fmt))

        for i, cell in enumerate(data):
            if not isinstance(cell, Cell):
                cell = Cell(cell, align[i], fmt[i], tag[i], style[i])
            else:
                pass  # ignore attribs for now
            data[i] = cell

        self.data = data

[docs]    def __str__(self):
        return str(self.data)

    def _repr_html_(self):
        return self.html()

[docs]    def html(self, cell_args={}, **kwargs):
        """return in HTML format"""
        res = ''
        for cell in self.data:
            res += cell.html(**cell_args)
        return markup.tag('tr', res, **kwargs)


[docs]class Table(list):
    """Table class with CSV I/O, easy access to columns, HTML output"""

[docs]    def __init__(self, data=[], **kwargs):
        """inits a table, optionally by reading a Excel, csv or html file
        :param data: list of list of cells, or string as filename
        :param titles: optional list of strings used as column id
        :param footer: optional list of functions used as column reducers
        """
        try:
            self.titles = data.titles
        except AttributeError:
            self.titles = kwargs.pop('titles', [])
        try:
            self.footer = data.footer
        except AttributeError:
            self.footer = kwargs.pop('footer', [])

        filename = None
        if isinstance(data, str):
            filename = data
            data = []
        else:  # ensure data is 2D and mutable
            if isinstance(data, dict):
                data = data.values()
            for row in data:
                if not itertools2.isiterable(row):  # build a column
                    row = [row]
                self.append(row)

        if filename:
            self.load(filename, **kwargs)

[docs]    def __repr__(self):
        """:return: repr string of titles+5 first lines"""
        return '%s(len=%d,titles=%s,data=%s)' % (self.__class__, len(self), self.titles, self[:5])

[docs]    def __str__(self):
        """:return: string of full tables with linefeeds"""
        res = ''
        if self.titles:
            res += str(self.titles)+'\n'
        for line in self:
            res += str(line)+'\n'
        return res

    def _repr_html_(self):
        return self.html()

[docs]    def html(self, head=None, foot=None, **kwargs):
        """HTML representation of table

        :param head: optional column headers, .titles by default
        :param foot: optional column footers, .footer by default
        :param style: style for the table
        :param colstyle: list of dict of style attributes for each column
        :param kwargs: optional parameters passed along to tag('table'...
            except:
            * start=optional start row
            * stop=optional end row
            used to display a subset of lines. in this case rows with '...' cells
            are displayed before and/or after the lines
        :return: string HTML representation of table
        """

        def TR(data, align=None, fmt=None, tag=None, style={}):
            res = ''
            row = Row(data=data, align=align, fmt=fmt, tag=tag, style=style)
            res += row.html()+'\n'
            return res

        def THEAD(data, fmt=None, style={}):
            res = "<thead>\n"
            res += TR(data=data, fmt=fmt, tag='th', style=style)
            res += "</thead>\n"
            return res

        def TFOOT(data, fmt=None, style={}):
            res = "<tfoot>\n"
            res += TR(data=data, fmt=fmt, tag='th', style=style)
            res += "</tfoot>\n"
            return res

        res = ''

        colstyle = kwargs.pop('colstyle', None)

        if head is None:
            head = self.titles
        if head:
            res += THEAD(head)
        start = kwargs.pop('start', 0)
        stop = kwargs.pop('stop', len(self))
        if start != 0:
            res += TR(['...']*self.ncols(), style=colstyle)
        for row in self[start:stop]:
            res += TR(row, style=colstyle)
        if stop != -1 and stop < len(self):
            res += TR(['...']*self.ncols(), style=colstyle)
        if foot is None:
            foot = self.footer
        if foot:
            res += TFOOT(foot)

        # kwargs['style'] = kwargs.pop('style', {'table-layout': 'fixed'})  # to prevent wrapping

        return markup.tag('table', res, **kwargs)+'\n'

[docs]    def load(self, filename, **kwargs):
        if self.titles:  # explicitly set
            l = kwargs.setdefault('titles_line', 0)
            kwargs.setdefault('data_line', l+1)
        else:  # read titles from the file
            l = kwargs.setdefault('titles_line', 1)
            kwargs.setdefault('data_line', l+1)
        ext = filename.split('.')[-1].lower()
        if ext in ('xls', 'xlsx'):
            self.read_xls(filename, **kwargs)
        elif ext in ('htm', 'html'):
            self.read_html(filename, **kwargs)
        elif ext in ('json'):
            self.read_json(filename, **kwargs)
        else:  # try ...
            self.read_csv(filename, **kwargs)
        return self  # to allow chaining

[docs]    def read_element(self, element, **kwargs):
        """read table from a DOM element.
        :Warning: drops all formatting
        """
        titles_line = kwargs.pop('titles_line', 1)-1
        data_line = kwargs.pop('data_line', 2)-1
        line = 0
        head = element.find('thead')
        if head is not None:
            for row in head.findall('tr'):
                if line == titles_line:
                    self.titles = [cell.data for cell in Row(row).data]
                line = line+1
        body = element.find('tbody')
        if body is None:
            body = element
        for row in body.findall('tr'):
            data = [cell.data for cell in Row(row).data]
            if not data:
                continue  # skip empty lines
            if not self.titles and line == titles_line:
                self.titles = data
            elif line >= data_line:
                self.append(data)
            line = line+1
        return self

[docs]    def read_html(self, filename, **kwargs):
        """read first table in HTML file
        """
        parser = kwargs.get('parser', defaultparser)
        element = ElementTree.parse(filename, parser()).getroot()
        try:
            if element.tag != 'table':  # file contains table as topmost tag
                element = element.find('.//table')  # find first table
        except AttributeError:
            pass

        if element is None:
            raise LookupError('no table found in file')

        self.read_element(element, **kwargs)
        return self

[docs]    def read_json(self, filename, **kwargs):
        """appends a json file made of lines dictionaries"""
        with open(filename, 'r') as file:
            for row in json.load(file):
                self.append(row)
        return self

[docs]    def read_xls(self, filename, **kwargs):
        """appends an Excel table"""
        titles_line = kwargs.pop('titles_line', 1)-1
        data_line = kwargs.pop('data_line', 2)-1

        import xlrd
        wb = xlrd.open_workbook(filename)
        sheet = kwargs.get('sheet', 0)
        if isinstance(sheet, str):
            s = wb.sheet_by_name(sheet)
        else:
            s = wb.sheet_by_index(sheet)

        for i in range(s.nrows):
            line = [Cell.read(s.cell(i, j).value) for j in range(s.ncols)]
            if i == titles_line:
                self.titles = line
            elif i >= data_line:
                self.append(line)
        return self

[docs]    def read_csv(self, filename, **kwargs):
        """appends a .csv or similar file to the table"""
        titles_line = kwargs.pop('titles_line', 1)-1
        data_line = kwargs.pop('data_line', 2)-1

        dialect = kwargs.setdefault('dialect', csv.excel)
        delimiter = kwargs.setdefault('delimiter', ';')
        # must be iso-8859-15 in some cases
        encoding = kwargs.pop('encoding', 'utf-8')
        errors = kwargs.pop('errors', 'strict')

        def reader():  # version for Python 3
            with open(filename, 'rt', errors=errors, encoding=encoding) as f:
                csv_reader = csv.reader(f, **kwargs)
                for row in csv_reader:
                    yield row

        for i, row in enumerate(reader()):
            if i == titles_line:  # titles can have no left/right spaces
                self.titles = [Cell.read(x) for x in row]
            elif i >= data_line:
                line = [Cell.read(x) for x in row]
                if line != [None]:  # strange last line sometimes ...
                    self.append(line)
        return self

[docs]    def save(self, filename, **kwargs):
        ext = filename.split('.')[-1].lower()
        if ext in ('xls', 'xlsx'):
            self.write_xlsx(filename, **kwargs)
        elif ext in ('htm', 'html'):
            with open(filename, 'w') as file:
                file.write(self.html(**kwargs))
        elif ext in ('json'):
            with open(filename, 'w') as file:
                file.write(self.json(**kwargs))
        else:  # try ...
            self.write_csv(filename, **kwargs)
        return self  # to allow chaining

[docs]    def write_xlsx(self, filename, **kwargs):
        import xlsxwriter

        workbook = xlsxwriter.Workbook(filename)
        worksheet = workbook.add_worksheet()
        df = workbook.add_format({'num_format': 'dd/mm/yyyy'})
        tf = workbook.add_format({'num_format': 'hh:mm:ss'})
        dtf = workbook.add_format({'num_format': 'dd/mm/yyyy hh:mm:ss'})

        def writerow(i, line):
            for j, s in enumerate(line):
                if isinstance(s, datetime):
                    worksheet.write_datetime(i, j, s, dtf)
                elif isinstance(s, date):
                    worksheet.write_datetime(i, j, s, df)
                elif isinstance(s, (time, timedelta)):
                    worksheet.write_datetime(i, j, s, tf)
                else:
                    worksheet.write(i, j, s)

        writerow(0, self.titles)
        for i, row in enumerate(self):
            writerow(i+1, row)

        workbook.close()
        return self

[docs]    def json(self, **kwargs):
        """
        :return: string JSON representation of table
        """
        def json_serial(obj):
            """JSON serializer for objects not serializable by default json code"""
            if hasattr(obj, 'isoformat'):
                return obj.isoformat()
            try:
                return str(obj)
            except Exception:
                pass
            raise TypeError("Type %s not serializable" % (type(obj)))
        array = [self.rowasdict(i) for i in range(len(self))]
        kwargs.setdefault('default', json_serial)
        return json.dumps(array, **kwargs)

[docs]    def write_csv(self, filename, **kwargs):
        """ write the table in Excel csv format, optionally transposed"""

        dialect = kwargs.get('dialect', 'excel')
        delimiter = kwargs.get('delimiter', ';')
        encoding = kwargs.pop('encoding', 'utf-8')  # was iso-8859-15 earlier
        empty = ''.encode(encoding)

        f = open(filename, 'w', newline='', encoding=encoding)

        def _encode(line):
            res = []
            for s in line:
                res.append(s)
            return res

        writer = csv.writer(f, dialect=dialect, delimiter=delimiter)
        if self.titles:
            s = _encode(self.titles)
            writer.writerow(s)
        for line in self:
            s = _encode(line)
            writer.writerow(s)
        f.close()
        return self

[docs]    def __eq__(self, other):
        """compare 2 Tables contents, mainly for tests"""
        if self.titles != other.titles:
            return False
        if len(self) != len(other):
            return False
        for i in range(len(self)):
            if self[i] != other[i]:
                return False
        return True

[docs]    def ncols(self):
        """
        :return: number of columns, ignoring title
        """

        return max(map(len, self))

[docs]    def find_col(self, title):
        """finds a column from a part of the title"""
        title = title.lower()
        for i, c in enumerate(self.titles):
            if c.lower().find(title) >= 0:
                return i
        return None

    def _i(self, column):
        '''column index'''
        if isinstance(column, int):
            return column
        try:
            return self.titles.index(column)
        except ValueError:
            return None

[docs]    def icol(self, column):
        """iterates a column"""
        i = self._i(column)
        for row in self:
            try:
                yield row[i]
            except:
                yield None

[docs]    def col(self, column, title=False):
        i = self._i(column)
        res = [x for x in self.icol(i)]
        if title:
            res = [self.titles[i]]+res
        return res

[docs]    def cols(self, title=False):
        """iterator through columns"""
        for i in range(self.ncols()):
            yield self.col(i, title)

[docs]    def transpose(self, titles_column=0):
        """transpose table
        :param: titles_column
        :return: Table where rows are made from self's columns and vice-versa
        """
        res = Table()
        for i, row in enumerate(self.cols(self.titles)):
            if i == titles_column:
                res.titles = row
            else:
                res.append(row)
        return res

[docs]    def index(self, value, column=0):
        """
        :return: int row number of first line where column contains value
        """
        for i, v in enumerate(self.icol(column)):
            if v == value:
                return i
        return None

[docs]    def __getitem__(self, n):
        try:
            c = self._i(n[1])
        except TypeError:
            return super(Table, self).__getitem__(n)
        else:
            rows = super(Table, self).__getitem__(n[0])
            if not isinstance(n[0], slice):
                return rows[c]
            return [row[c] for row in rows]

[docs]    def get(self, row, col):
        return self[row, col]

[docs]    def set(self, row, col, value):
        col = self._i(col)
        if row >= len(self):
            self.extend([list()]*(1+row-len(self)))
        if col >= len(self[row]):
            self[row].extend([None]*(1+col-len(self[row])))
        self[row][col] = value

[docs]    def setcol(self, col, value, i=0):
        """set column values
        :param col: int or string column index
        :param value: single value assigned to whole column or iterable assigned to each cell
        :param i: optional int : index of first row to assign
        """
        j = self._i(col)
        if itertools2.isiterable(value):
            for v in value:
                self.set(i, j, v)
                i += 1
        else:
            for i in range(i, len(self)):
                self.set(i, j, value)

[docs]    def append(self, line):
        ''' appends a line to table
        :param line: can be either:
        * a list
        * a dict or column names:values
        '''
        if isinstance(line, dict):
            r = len(self)  # row number
            for k, v in line.items():
                i = self._i(k)
                if i is None:  # column doesn't exist:
                    i = len(self.titles)
                    self.titles.append(k)
                self.set(r, i, v)
        else:
            list.append(self, list(line))
        return self

[docs]    def addcol(self, title, val=None, i=0):
        '''add column to the right'''
        col = len(self.titles)
        self.titles.append(title)
        if not itertools2.isiterable(val):
            val = [val]*(len(self)-i)
        for v in val:
            self.set(i, col, v)
            i += 1
        return self

[docs]    def sort(self, by, reverse=False):
        '''sort by column'''
        i = self._i(by)
        if isinstance(i, int):
            list.sort(self, key=lambda x: x[i], reverse=reverse)
        else:
            list.sort(i, reverse=reverse)

[docs]    def rowasdict(self, i):
        ''' returns a line as a dict '''
        return collections.OrderedDict(zip(self.titles, self[i]))

[docs]    def asdict(self):
        for i in range(len(self)):
            yield self.rowasdict(i)

[docs]    def groupby_gen(self, by, sort=True, removecol=True):
        """generates subtables
        """
        i = self._i(by)
        t = self.titles
        if removecol:
            t = t[:i]+t[i+1:]
        if sort:
            self.sort(i)
        else:
            pass  # groupby will group CONSECUTIVE lines with same i, so entries at bottom of table will replace the earlier entries in dict
        for k, g in itertools.groupby(self, key=lambda x: x[i]):
            if removecol:
                r = Table(titles=t, data=[a[:i]+a[i+1:] for a in g])
            else:
                r = Table(titles=t, data=list(g))
            yield k, r

[docs]    def groupby(self, by, sort=True, removecol=True):
        """ ordered dictionary of subtables
        """
        return collections.OrderedDict(
            (k, t) for (k, t) in self.groupby_gen(by, sort, removecol)
        )

[docs]    def hierarchy(self, by='Level',
                  # creates an object from a line
                  factory=lambda row: (row, []),
                  # creates a parend/child relation between x and y. raw is also available (for qty)
                  linkfct=lambda x, y, row: x[1].append(y)
                  ):
        '''builds a structure from a table containing a "level" column'''
        res = []
        i = self._i(by)
        stack = []
        for row in self:
            obj = factory(row)
            level = row[i]
            if level == 1:
                res.append(obj)
            while level <= len(stack):
                stack.pop()
            if stack:
                linkfct(stack[-1], obj, row)
            stack.append(obj)
        return res

[docs]    def applyf(self, by, f, skiperrors=False):
        """ apply a function to a column
        :param by: column name of number
        :param f: function of the form lambda cell:content
        :param skiperrors: bool. if True, errors while running f are ignored
        :return: bool True if ok, False if skiperrors==True and conversion failed
        """
        res = True
        i = self._i(by)
        for row in self:
            try:
                x = row[i]
                row[i] = f(x)
            except Exception as e:
                if not skiperrors:
                    # TODO: change message to ('could not applyf to %s'%x) for Py2+3
                    raise
                res = False
        return res

    def _datetimeformat(self, by, fmt, function, skiperrors):
        """convert a column to a date, time or datetime
        :param by: column name of number
        :param fmt: string defining format, or list of formats to try one by one
        :param function: function to call
        :param skiperrors: bool. if True, conversion errors are ignored
        :return: bool True if ok, False if skiperrors==True and conversion failed
        """
        if isinstance(fmt, list):
            for f in fmt:
                res = self._datetimeformat(
                    by, f, function, True if f != fmt[-1] else skiperrors)
            return res
        return self.applyf(by, lambda x: function(x, fmt=fmt), skiperrors)

[docs]    def to_datetime(self, by, fmt='%Y-%m-%d %H:%M:%S', skiperrors=False):
        """convert a column to datetime
        """
        return self._datetimeformat(by, fmt, datetime2.datetimef, skiperrors)

[docs]    def to_date(self, by, fmt='%Y-%m-%d', skiperrors=False):
        """convert a column to date
        """
        return self._datetimeformat(by, fmt, datetime2.datef, skiperrors)

[docs]    def to_time(self, by, fmt='%H:%M:%S', skiperrors=False):
        """convert a column to time
        """
        return self._datetimeformat(by, fmt, datetime2.timef, skiperrors)

[docs]    def to_timedelta(self, by, fmt=None, skiperrors=False):
        """convert a column to time
        """
        return self._datetimeformat(by, fmt, datetime2.timedeltaf, skiperrors)

[docs]    def total(self, funcs):
        """build a footer row by appling funcs to all columns
        """
        funcs = funcs+[None]*(len(self.titles)-len(funcs))
        self.footer = []
        for i, f in enumerate(funcs):
            try:
                self.footer.append(f(self.col(i)))
            except:
                self.footer.append(f)
        return self.footer

[docs]    def remove_lines_where(self, f, value=(None, 0, '')):
        """
        :param f: function of the form lambda line:bool returning True if line should be removed
        :return: int number of lines removed
        """
        i = self._i(f)
        if i is not None:
            f = (lambda x: x[i] in value)
        return len(itertools2.removef(self, f))


if __name__ == "__main__":
    t = Table([[1, 2], [3, 'a 4']], style='width:100%;')
    print(t.html())