Source code for toto.inputs.xls

"""Read xls,xlsx file
    This import Excel type file. The function uses the read_excel function from panda <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html>_.
    This class returns a Panda Dataframe with some extra attributes such as Latitude,Longitude,Units.
    
    Parameters
    ~~~~~~~~~~

    filename : (files,) str or list_like
        A list of filename to process.
    sheet_name : str, int, list, or None, default 0
        Strings are used for sheet names. Integers are used in zero-indexed
        sheet positions. Lists of strings/integers are used to request
        multiple sheets. Specify None to get all sheets.

        Available cases:

        * Defaults to ``0``: 1st sheet as a `DataFrame`
        * ``1``: 2nd sheet as a `DataFrame`
        * ``"Sheet1"``: Load sheet with name "Sheet1"
        * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
          as a dict of `DataFrame`
        * None: All sheets.
    colNames : List, default []
        List of column names to use.
    unitNames : List, default []
        List of unit to use. 
    miss_val : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN. If dict passed, specific
        per-column NA values. 
    colNamesLine : int, default 1
        Line number where the header are defined
    skiprows : list-like, int or callable, optional
        Line numbers to skip (0-indexed) or number of lines to skip (int)
        at the start of the file.
        If callable, the callable function will be evaluated against the row
        indices, returning True if the row should be skipped and False otherwise.
        An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
    skipfooter : int, default 0
        Number of lines at bottom of file to skip (Unsupported with engine='c').
    unitNamesLine : int, default 1
        Line number where the units are defined 
    single_column : bool, default False
        The time is represented in a single column
    customUnit : str, default '%d-%m-%Y %H:%M:%S'
        String reprensenting the time format
    unit : str default 's', can be 'auto','custom','matlab' or 's' and 'D'
        unit of the single column time. Only matter if `single_column` is True
    time_col_name: dict, default {'Year':'year','Month':'month','Day':'day','Hour':'hour','Min':'Minute','Sec':'Second'}
        Dictonary for renaming the each column, so Panda can interprate the time. Only matter if `single_column` is False
    

    Examples
    ~~~~~~~~
    
    >>> from toto.inputs.xls import XLSfile
    >>> tx=XLSfile([filename],sheetnames='test3', colNames= [], unitNames= [],miss_val='NaN', colNamesLine= 1, skiprows= 2, unitNamesLine= 0,\
    skipfooter= 0, single_column= True, unit= 's',\
    customUnit= '%d-%m-%Y %H:%M:%S', time_col_name= {})
    >>> tx.reads()
    >>> tx.read_time()
    >>> df=tx._toDataFrame()

"""
import glob,os,sys
import pandas as pd
import datetime as dt
import numpy as np
from openpyxl import load_workbook
_NUMERIC_KINDS = set('buifc')


[docs]class XLSfile():
[docs] @staticmethod def defaultExtensions(): return ['.xls','.xlsx']
def __init__(self,filename,sheetnames=[],\ colNames=[],\ unitNames=[],\ miss_val='NaN',\ colNamesLine=1,\ skiprows=0,\ unitNamesLine=0,\ skipfooter=0,\ single_column=False,\ unit='s',\ customUnit='%d-%m-%Y %H:%M:%S',\ time_col_name={'Year':'year','Month':'month','Day':'day','Hour':'hour','Min':'Minute','Sec':'Second'},\ ): self.filename = filename self.fileparam=lambda:None self.fileparam.sheetnames=sheetnames self.fileparam.colNames = colNames self.fileparam.unitNames = unitNames self.fileparam.miss_val = miss_val self.fileparam.ext=self.defaultExtensions() self.fileparam.colNamesLine = colNamesLine self.fileparam.skiprows = skiprows self.fileparam.unitNamesLine = unitNamesLine self.fileparam.skipfooter = skipfooter self.fileparam.single_column=single_column self.fileparam.unit=unit self.fileparam.customUnit=customUnit self.fileparam.time_col_name=time_col_name self.encoding = None self.data=[] # set usr defined parameter
[docs] def reads(self): for file in self.filename: self.read(file)
[docs] def read(self,filename): wb = load_workbook(filename) if self.fileparam.sheetnames==[]: self.fileparam.sheetnames=list(wb.keys()) if isinstance(self.fileparam.sheetnames,str): self.fileparam.sheetnames=[self.fileparam.sheetnames] # # unit header if self.fileparam.unitNamesLine: for i,sheet in enumerate(self.fileparam.sheetnames): df=wb[sheet] unit={} unitNames=[x.value for x in df[self.fileparam.unitNamesLine] if x.value] for i,col in enumerate(self.fileparam.colNames): unit[col]=unitNames[i].replace('[','').replace(']','') self.fileparam.unitNames.append(unit) for i,sheet in enumerate(self.fileparam.sheetnames): df=wb[sheet] colname=[x.value for x in df[self.fileparam.colNamesLine] if x.value] self.fileparam.colNames.append(colname) try: df=pd.read_excel(filename,sheet_name=sheet,skiprows=self.fileparam.skiprows,\ header=None,names=colname,skipfooter=self.fileparam.skipfooter,\ na_values=self.fileparam.miss_val,engine='openpyxl') except pd.errors.ParserError as e: raise WrongFormatError('XLS File {}: '.format(filename)+e.args[0]) keys=list(df.keys()) for key in keys: if np.all(df[key].isna()): del df[key] # ### check if all numerics # if not np.asarray(df[key].values).dtype.kind in _NUMERIC_KINDS: # print('Warning %s: this key was deleted containes string' % key) # del df[key] self.data.append(df)
[docs] def read_time(self): for i,df in enumerate(self.data): if self.fileparam.single_column is True: if self.fileparam.unit == 'auto': time=pd.to_datetime(df[self.fileparam.time_col_name]) elif self.fileparam.unit == 's' or self.fileparam.unit == 'D': time=pd.to_datetime(df[self.fileparam.time_col_name],unit=self.fileparam.unit) elif self.fileparam.unit == 'custom': time=pd.to_datetime(df[self.fileparam.time_col_name],format=self.fileparam.customUnit) elif self.fileparam.unit == 'excel': time=pd.TimedeltaIndex(df[self.fileparam.time_col_name], unit='d') + dt.datetime(1970,1,1) time=pd.to_datetime(time) del df[self.fileparam.time_col_name] else: old_name=self.fileparam.time_col_name.keys() time=pd.to_datetime(df[old_name].rename(columns=self.fileparam.time_col_name)) for oldkey in old_name: del df[oldkey] self.data[i]=df self.data[i]['time']=time print(time) self.data[i].set_index('time',inplace=True,drop=False) self.add_unit(i) keys=self.data[i].keys() for key in keys: self.data[i][key].long_name=key
[docs] def add_unit(self,i): keys=self.data[i].keys() for key in keys: if key in self.fileparam.unitNames: units=self.fileparam.unitNames[key] self.data[i][key].units=units elif '[' in key and ']' in key: a,b=key.split('[') self.data[i].rename(columns={key: a},inplace=True) self.data[i][a].units=b.split(']')[0]
def _toDataFrame(self): return self.data