Barak 0.3.2 documentation

barak.io

Contents

Source code for barak.io

""" Functions to read and write text, fits and pickle files.
"""
from itertools import izip
import cPickle as pickle
import os, gzip
import numpy as np
from utilities import adict

def readtxt(fh, sep=None, usecols=None, comment='#', skip=0, arrays=True,
            names=None, readnames=False, converters=None, mintype=int):
    """ Reads columns from a text file into arrays, converting to int,
    float or str where appropriate.

    By default the column separator is whitespace. `fh` can be either
    an input filename or an iterable (e.g. a file object, list or
    iterator).

    Parameters
    ----------
    fh : filename or iterable object
      Input data.
    sep : str (default `None`)
      A string used to separate items on a row (also known as a
      delimiter). Default is None, which means whitespace.
    usecols : int or tuple of ints, optional
      Indices of columns to be read. By default all columns are read.
    comment : str (default `#`)
      Character marking the start of a comment.
    skip : int (default `0`)
      Number of rows to skip (not counting commented or blank lines)
      before reading data.
    arrays : bool (`True`)
      If True, all columns are converted to Numpy arrays. If False,
      columns are returned as lists.
    names : str or sequence of str (default `None`)
      If `names` is given and `arrays` is True, the data are placed in
      a Numpy record array with field names given by `names`. Can also
      be a single string of comma-separated values.
    readnames : bool (`False`)
      If `readnames` is True the first line of the file is read to
      find the field names. This overrides the `names` keyword.
    converters : dict (`None`)
      Functions to apply to each entry of a column. Each (key,value)
      pair gives the column index (key) and the function to be applied
      to each entry in that column (value).
    mintype : type, either `int` or `float` (default `int`)
      The most restrictive conversion attempted for each entry; `int`
      falls back to `float`, which falls back to stripped `str`.

    Returns
    -------
    Either a structured array or a list of columns.

    Examples
    --------
    >>> list_of_all_cols = readtxt('filename')
    >>> ninthcol, fifthcol = readtxt('filename', sep=',', usecols=(8,4))
    >>> firstcol = readtxt('filename', comment='%', usecols=[0])
    >>> recarray = readtxt('filename', sep=',', usecols=(1,3), names='x,y')
    """
    # typedict maps a converter to the next, less restrictive converter
    # to try when it fails: int -> float -> stripped str.
    if mintype == float:
        typedict = {float : lambda x: str(x).strip()}
    elif mintype == int:
        typedict = {int : float, float : lambda x: str(x).strip()}
    else:
        raise ValueError('Unknown minimum type %s' % mintype)

    def convert(row, funcs):
        # convert each item in a row to int, float or str.
        for i,item in enumerate(row):
            while True:
                try:
                    row[i] = funcs[i](item)
                except ValueError:
                    # update the list of converters: demote this
                    # column's converter to the next fallback type and
                    # retry the same item.
                    try:
                        funcs[i] = typedict[funcs[i]]
                    except KeyError:
                        # no fallback left (already at str)
                        raise ValueError('Converter %s failed '
                                         'on %r' % (funcs[i], item))
                else:
                    break
        return row,funcs

    # Open the input if a filename was given; remember whether we
    # opened it so we only close files we own.
    needclose = False
    if isinstance(fh, basestring):
        if fh.endswith('.gz'):
            import gzip
            fh = gzip.open(fh)
        else:
            fh = open(fh)
        needclose = True

    data = iter(fh)
    if comment is not None:
        # NOTE(review): len_comment is never used below.
        len_comment = len(comment)
    if names and isinstance(names, basestring):
        # allow a single comma-separated string of names
        names = [n.strip() for n in names.split(',')]

    skipped = 0
    out = []
    # main loop to read data
    for irow, row in enumerate(data):
        if comment is not None:
            # strip trailing comments
            row = row.split(comment)[0]
        row = row.lstrip()
        if not row:
            continue
        # skip rows only after removing blank/commented lines
        if skipped < skip:
            skipped += 1
            continue
        row = row.split(sep)
        if readnames:
            # first data-like row holds the field names
            names = [r.strip() for r in row]
            readnames = False
            continue
        if not out:
            # first row with data, so initialise converters
            funcs = [mintype] * len(row)
            if converters is not None:
                for i in converters:
                    funcs[i] = converters[i]
            if usecols is not None:
                funcs = [funcs[i] for i in usecols]
        if usecols is not None:
            try:
                row = [row[i] for i in usecols]
            except IndexError:
                raise IndexError('Columns indices: %s, but only %i entries in '
                                 'this row!' % (usecols, len(row)))
        try:
            row, funcs = convert(row, funcs)
        except IndexError:
            # Probably there are more items in this row than in
            # previous rows. This usually indicates a problem, so
            # raise an error.
            raise IndexError('Too many items on row %i: %s' % (irow+1, row))
        if names:
            assert len(row) == len(names), '%i, %i, %s ' % (
                len(names), irow+1, row)
        out.append(row)

    if needclose:
        fh.close()

    # rows to columns, truncating to number of words on shortest line.
    if arrays:
        if names is not None:
            out = np.rec.fromrecords(out, names=names)
        else:
            out = [np.array(c) for c in izip(*out)]
    else:
        out = [list(c) for c in izip(*out)]

    # a single unnamed column is returned bare, not wrapped in a list
    if len(out) == 1 and names is None:
        return out[0]
    else:
        return out
def writetxt(fh, cols, sep=' ', names=None, header=None, overwrite=False,
             fmt_float='s'):
    """ This is deprecated. Use `writetable()` with file type '.tbl'
    instead.

    Write data to a column-aligned text file.

    Structured array data written using this function can be read
    again using:

    >>> readtxt(filename, readnames=True)

    Parameters
    ----------
    fh : file object or str
      The file to be written to.
    cols : structured array or a list of columns
      Data to be written.
    sep : str (' ')
      A string used to separate items on each row.
    names : list, string, False or None (None)
      Column names. Can be a comma-separated string of names. If
      False, do not print any names. If None and `cols` is a
      structured array, column names are the array field names.
    header : str (None)
      A header written before the data and column names.
    overwrite : bool (False)
      If True, overwrite an existing file without prompting.
    fmt_float : str ('s')
      '%' conversion specifier used for floating point columns
      (e.g. 's', '.4f', 'e').
    """
    # Open file (checking whether it already exists)
    if isinstance(fh, basestring):
        if not overwrite:
            # keep prompting until the user accepts an overwrite or
            # supplies a filename that does not exist yet
            while os.path.lexists(fh):
                c = raw_input('File %s exists, overwrite? (y)/n: ' % fh)
                if c == '' or c.strip().lower()[0] != 'n':
                    break
                else:
                    fh = raw_input('Enter new filename: ')
        fh = open(fh, 'w')

    if isinstance(names, basestring):
        names = names.split(',')

    # If cols is a structured array, pull out the individual field
    # columns (and take the field names unless names were given).
    try:
        recnames = cols.dtype.names
    except AttributeError:
        pass
    else:
        if names not in (None, False):
            recnames = names
        cols = [cols[n] for n in recnames]
        if names is None:
            names = list(recnames)

    cols = [np.asanyarray(c) for c in cols]
    if names not in (None, False):
        if len(names) < len(cols):
            raise ValueError('Need one name for each column!')

    nrows = [len(c) for c in cols]
    if max(nrows) != min(nrows):
        raise ValueError('All columns must have the same length!')
    nrows = nrows[0]

    # Get the maximum field width for each column, so that the columns
    # will line up when printed. Also find the printing format for
    # each column.
    maxwidths = []
    formats = []
    for col in cols:
        # dtype string without the byte-order character, e.g. 'i8',
        # 'f8', 'S10', 'b1'
        dtype = col.dtype.str[1:]
        if dtype.startswith('S'):
            maxwidths.append(int(dtype[1:]))
            formats.append('s')
        elif dtype.startswith('i'):
            maxwidths.append(max([len('%i' % i) for i in col]))
            formats.append('i')
        elif dtype.startswith('f'):
            maxwidths.append(max([len(('%' + fmt_float) % i) for i in col]))
            formats.append(fmt_float)
        elif dtype.startswith('b'):
            # booleans are printed as 0/1
            maxwidths.append(1)
            formats.append('i')
        else:
            raise ValueError('Unknown column data-type %s' % dtype)

    if names not in (None, False):
        # widen columns to fit their names if necessary
        for i,name in enumerate(names):
            maxwidths[i] = max(len(name), maxwidths[i])

    # Left-aligned, width-padded format for every column except the
    # last, which is written without padding.
    fmt = sep.join(('%-'+str(m)+f) for m,f in zip(maxwidths[:-1],
                                                  formats[:-1]))
    fmt += sep + '%' + formats[-1] + '\n'

    if names:
        fmtnames = sep.join(('%-' + str(m) + 's') for m in maxwidths[:-1])
        fmtnames += sep + '%s\n'

    # Write the header if it was given
    if header is not None:
        fh.write(header)
    if names:
        fh.write(fmtnames % tuple(names))
    for row in izip(*cols):
        fh.write(fmt % tuple(row))
    # NOTE(review): this also closes a caller-supplied file object.
    fh.close()
    return
def writetabfits(filename, rec, units=None, overwrite=True):
    """ This is deprecated. Use `writetable()` with file type '.fits'
    instead.

    Writes a list of numpy arrays or a structured array to a binary
    fits table. Works best with structured arrays.

    Parameters
    ----------
    filename : str
      Filename to write to.
    rec : Sequence of arrays or record array
      Data to write.
    units : list of str (default None)
      Sequence of strings giving the units for each column.
    overwrite : bool (default True)
      If True, overwrite an existing file without complaint.
    """
    import pyfits

    # numpy dtype -> FITS binary-table format codes.
    # BUGFIX: f8 (double) must map to 'D', not 'F' ('F' is an ASCII
    # table format code and is invalid in a binary table).
    fmts = dict(f4='E', f8='D', i2='I', i4='J', i8='K', b1='L')
    try:
        rec.dtype
    except AttributeError:
        # a plain sequence of columns; pack into a record array
        rec = np.rec.fromarrays(rec)
    if rec.dtype.names is None:
        raise ValueError('Input must be a list of columns or a '
                         'structured array')
    if units is None:
        units = [None] * len(rec.dtype.descr)

    cols = []
    for unit, name in zip(units, rec.dtype.names):
        a = rec[name]
        # dtype string without the byte-order character, e.g. 'f8', 'S10'
        dtype = a.dtype.str[1:]
        if dtype.startswith('S'):
            # fixed-width string column: 'A' + width
            fmt = 'A' + dtype[1:]
        else:
            fmt = fmts[dtype]
        cols.append(pyfits.Column(name=name, format=fmt, array=a, unit=unit))

    tbhdu = pyfits.new_table(pyfits.ColDefs(cols))
    tbhdu.writeto(filename, clobber=overwrite)
def readtabfits(filename, ext=None):
    """ Read fits binary table data, such as that written by
    `writetabfits()`.

    Consider using `atpy.Table(filename)` instead.

    Parameters
    ----------
    filename : str
      Name of the fits file to read.
    ext : int, optional
      Extension to read. If omitted, pyfits chooses the default
      extension.

    Returns
    -------
    numpy record array view of the table data.
    """
    import pyfits
    if ext is None:
        data = pyfits.getdata(filename)
    else:
        data = pyfits.getdata(filename, ext=ext)
    return data.view(np.recarray)
def saveobj(filename, obj, overwrite=False):
    """ Save a python object to filename using pickle.

    Parameters
    ----------
    filename : str
      Output filename. If it ends in '.gz' the pickle is gzip
      compressed.
    obj : object
      Any picklable python object.
    overwrite : bool (False)
      If False, raise IOError when `filename` already exists.

    Raises
    ------
    IOError
      If the file exists and `overwrite` is False.
    """
    if os.path.lexists(filename) and not overwrite:
        raise IOError('%s exists' % filename)
    if filename.endswith('.gz'):
        fh = gzip.open(filename, 'wb')
    else:
        fh = open(filename, 'wb')
    # close the file even if pickling fails (the original leaked the
    # handle on error)
    try:
        # protocol 2 is the highest protocol readable by both python
        # 2 and 3
        pickle.dump(obj, fh, protocol=2)
    finally:
        fh.close()
def loadobj(filename):
    """ Load a python object pickled with saveobj.

    Parameters
    ----------
    filename : str
      Pickle filename; gzip-compressed input is detected by a '.gz'
      suffix.

    Returns
    -------
    The unpickled object.
    """
    if filename.endswith('.gz'):
        fh = gzip.open(filename, 'rb')
    else:
        fh = open(filename, 'rb')
    # close the file even if unpickling fails (the original leaked the
    # handle on error)
    try:
        obj = pickle.load(fh)
    finally:
        fh.close()
    return obj
def parse_config(filename, defaults=None):
    """ Read options from a configuration file.

    Parameters
    ----------
    filename : str or file object
      The configuration filename or a file object.
    defaults : dict (None)
      A dictionary with default values for options.

    Returns
    -------
    d : dictionary
      The options are returned as a dictionary that can also be
      indexed by attribute.

    Notes
    -----
    Ignores blank lines, lines starting with '#', and anything on a
    line after a '#'. The parser attempts to convert the values to
    int, float or boolean, otherwise they are left as strings.

    Sample format::

      # this is the file with the line list
      lines = lines.dat
      x = 20
      save = True    # save the data
    """
    cfg = adict()
    # BUGFIX: the default used to be a mutable `defaults={}`; use None
    # as the sentinel instead (behaviour for callers is unchanged).
    if defaults is not None:
        cfg.update(defaults)

    # Only close the file if we open it here; a caller-owned file
    # object is left open (matches the needclose convention in
    # readtxt; the original closed it unconditionally).
    needclose = False
    if isinstance(filename, basestring):
        fh = open(filename)
        needclose = True
    else:
        fh = filename

    for row in fh:
        # skip blank lines and comment-only lines
        if not row.strip() or row.lstrip().startswith('#'):
            continue
        # drop trailing comments, then split on the first '='
        option, value = [r.strip() for r in row.split('#')[0].split('=', 1)]
        # try successively weaker conversions: int, float, then the
        # keywords True/False/None; otherwise leave as a string
        try:
            value = int(value)
        except ValueError:
            try:
                value = float(value)
            except ValueError:
                if value == 'True':
                    value = True
                elif value == 'False':
                    value = False
                elif value == 'None':
                    value = None
        cfg[option] = value

    if needclose:
        fh.close()
    return cfg
def readsex(filename, catnum=None):
    """ Read a sextractor catalogue into a Numpy record array.

    Parameters
    ----------
    filename : str
      Sextractor output catalogue name
    catnum : int, optional
      If the Sextractor file is in LDAC_FITS format and contains more
      than one catalogue, this option specifies the catalogue number.

    Returns
    -------
    s : numpy record array
      Record array with field names the same as those in the
      sextractor catalogue.
    """
    fh = open(filename)
    # get the header
    row = fh.next()
    while not row.strip():
        row = fh.next()
    # A FITS file starts with 'SIMPLE  =', so an '=' at index 8 of the
    # first non-blank line indicates a (LDAC) FITS catalogue.
    if row[8] == '=':
        fh.close()
        # assume a fits file
        try:
            import pyfits
        except ImportError:
            raise ValueError("Install Pyfits to read fits files")
        fh = pyfits.open(filename)
        if len(fh) > 3 and catnum is None:
            # more than one catalogue present; caller must choose
            raise ValueError("specify catalogue number")
        elif catnum is not None:
            # in LDAC_FITS layout each catalogue's data sits in every
            # second HDU
            return pyfits.getdata(filename, catnum*2).view(np.recarray)
        else:
            return pyfits.getdata(filename, 2).view(np.recarray)
    # Plain ASCII catalogue: collect the '#' header lines describing
    # the columns ('# <number> <NAME> <description> ...').
    hd = []
    while row.startswith('#'):
        if row[1:].strip():
            hd.append(row)
        row = fh.next()
    fh.close()
    # get column numbers and names (header fields 1 and 2; this
    # subscripted zip() is python 2 only)
    number, names = zip(*[row.split() for row in hd])[1:3]
    # sextractor column numbers are 1-based
    indcol = [int(c)-1 for c in number]
    if len(names) - len(set(names)):
        dup = [n for n in set(names) if names.count(n) > 1]
        raise ValueError('fields with same names: %s' % dup)
    # read in the data
    return readtxt(filename, names=names, usecols=indcol)
def sex_to_DS9reg(filename, s, colour='green', tag='all', withtext=False):
    """Write a DS9 region file from SExtractor output.

    Parameters
    ----------
    filename : str
      Region file name.
    s : array
      The output of `readsex`.
    colour : str ('green')
      Region colour. One of {cyan blue magenta red green yellow white
      black}
    tag : str ('all')
      DS9 tag for all the regions
    withtext : bool (False)
      If True, then mark each region with either its magnitude (if
      given), otherwise its index in the input array `s`.
    """
    # (docstring fix: the parameter is `withtext`, it was documented
    # as `with_text`)
    names = set(s.dtype.names)
    regions = ['global font="helvetica 10 normal" select=1 highlite=1 '
               'edit=0 move=1 delete=1 include=1 fixed=0 source']
    regions.append('image')

    # prefer plain positions, fall back to windowed positions
    fields = ['X_IMAGE', 'Y_IMAGE']
    if not ('X_IMAGE' in names and 'Y_IMAGE' in names):
        fields = ['XWIN_IMAGE', 'YWIN_IMAGE']
        if not ('XWIN_IMAGE' in names and 'YWIN_IMAGE' in names):
            raise ValueError('require either X_IMAGE and Y_IMAGE '
                             'or XWIN_IMAGE and YWIN_IMAGE')

    fmt = 'ellipse(%s %s %s %s %s) # text={%s} color=%s tag={%s}'
    ellipse_vals = ['A_IMAGE','B_IMAGE','THETA_IMAGE']
    ellipsewin_vals = ['AWIN_IMAGE','BWIN_IMAGE','THETAWIN_IMAGE']
    if all((n in names) for n in ellipse_vals):
        fields = list(fields) + ellipse_vals
    elif all((n in names) for n in ellipsewin_vals):
        fields = list(fields) + ellipsewin_vals
    else:
        # we don't have any ellipticity info, just write points.
        fmt = 'point(%s %s) # point=circle text={%s} color=%s tag={%s}'

    for i,rec in enumerate(s):
        vals = [rec[f] for f in fields]
        if withtext:
            if 'MAG_AUTO' in names:
                text = '%i %.2f' % (i+1, rec['MAG_AUTO'])
            else:
                # no magnitude available; label with the 1-based index
                text = i+1
        else:
            text = ''
        vals.extend([text, colour, tag])
        regions.append(fmt % tuple(vals))

    # close the file even if a write fails (the original leaked the
    # handle on error)
    fh = open(filename,'w')
    try:
        fh.write('\n'.join(regions))
    finally:
        fh.close()
def write_DS9reg(x, y, filename=None, coord='IMAGE', ptype='x', size=20,
                 c='green', tag='all', width=1, text=None):
    """Write a region file for ds9 for a list of coordinates.

    Parameters
    ----------
    x, y : arrays of floats, shape (N,)
      The coordinates. These may be image or WCS.
    filename : str, optional
      A filename to write to.
    coord : str ('IMAGE')
      The coordinate type. For example IMAGE (pixel coordinates) or
      J2000.
    ptype : str ('x')
      DS9 point type. One of {circle box diamond cross x arrow
      boxcircle}
    size : int (20)
      DS9 point size.
    c : str ('green')
      point colour: one of {cyan blue magenta red green yellow white
      black}.
    tag : str ('all')
      DS9 tag.
    width : int (1)
      DS9 line width.
    text : sequence, optional
      Label for each region; defaults to each point's index.

    Returns
    -------
    regions : list of str
      The region-file lines that were (optionally) written.
    """
    regions = ['global font="helvetica 10 normal" select=1 highlite=1 '
               'edit=0 move=1 delete=1 include=1 fixed=0 source\n']
    regions.append(coord + '\n')

    def iscontainer(s):
        # True if `s` already provides one value per point; scalar
        # (non-iterable) arguments are broadcast below. A string is
        # treated as per-point only when its length matches len(x).
        try:
            it = iter(s)
        except TypeError:
            return False
        else:
            if isinstance(s, basestring) and len(s) != len(x):
                return False
        return True

    # broadcast scalar options to one value per point
    if not iscontainer(ptype):
        ptype = [ptype] * len(x)
    if not iscontainer(size):
        size = [size] * len(x)
    if not iscontainer(width):
        width = [width] * len(x)
    if not iscontainer(text):
        # default labels are the point indices
        text = range(len(x))
    if not iscontainer(c):
        c = [c] * len(x)
    if not iscontainer(tag):
        tag = [tag] * len(x)

    # NOTE(review): the backslash-newline continuation inside this
    # string literal is in the original source; the continuation line's
    # leading column becomes part of the format string.
    fmt = ('point(%12.8f,%12.8f) # \
point=%s %s width=%s text={%s} color=%s tag={%s}\n')

    for i in xrange(len(x)):
        vals = (x[i], y[i], ptype[i], size[i], width[i], text[i],
                c[i], tag[i])
        regions.append(fmt % vals)

    if filename is not None:
        fh = open(filename,'w')
        fh.writelines(regions)
        fh.close()
    return regions
def writetable(filename, cols, units=None, names=None, header=None,
               keywords=None, overwrite=False):
    """ Write a series of data columns to a file.

    Data written using this function can be read again using:

    >>> atpy.Table(filename)

    Parameters
    ----------
    filename : str
      The output filename. Its suffix determines the file type. For
      example '.tbl', '.fits' or '.fits.gz'.
    cols : structured array, atpy Table instance or a list of columns
      Data to be written.
    units : list
      Units of each column.
    names : list or string (None)
      Column names. Can be a comma-separated string of names. If None
      and `cols` is a structured array, column names are the array
      field names.
    header : str (None)
      A header written before the data.
    keywords : dict (None)
      A dictionary of key-value pairs to write to the header.
    overwrite : bool (False)
      If True, overwrite an existing file without prompting.
    """
    import atpy

    if isinstance(cols, atpy.Table):
        t = cols
        # remember the formats so they can be restored after the
        # '.tbl' branch below rewrites them
        old_formats = [t.columns[k].format for k in t.keys()]
    else:
        try:
            recnames = cols.dtype.names
        except AttributeError:
            # plain sequence of columns; they must all be equal length
            assert np.allclose(len(cols[0]), [len(col) for col in cols[1:]])
        else:
            # structured array: split into individual field columns
            if names is not None:
                recnames = names
            else:
                names = list(recnames)
            cols = [cols[n] for n in recnames]
        if names is None:
            names = ['col%i' % (i+1) for i in range(len(cols))]
        elif isinstance(names, basestring):
            names = names.split(',')
        if units is None:
            units = [''] * len(names)
        t = atpy.Table()
        for i in xrange(len(cols)):
            t.add_column(names[i], cols[i], unit=units[i])
        if header is not None:
            for comment in header.split('\n'):
                t.add_comment(comment)
        if keywords is not None:
            for key,value in keywords.iteritems():
                t.add_keyword(key, value)

    if filename.endswith('.tbl') or filename.endswith('.tbl.gz'):
        # use str for int and floats to remove whitespace and make
        # easily-readable float values in IPAC tables - be warned this
        # may change the printed float values by about one part in
        # 1e12.
        for name in t.keys():
            if t.columns[name].format.endswith('s'):
                continue
            width = 0
            for item in t.data[name]:
                width = max(width, len(str(item)))
            t.columns[name].format = str(width) + 's'

    t.write(filename, overwrite=overwrite)

    if isinstance(cols, atpy.Table):
        # return column formats to their original values.
        # BUGFIX: the original looped `for fmt in old_formats:
        # t.columns[name].format = fmt`, reusing the leftover loop
        # variable `name` - it assigned every saved format to a single
        # column, and raised NameError when the '.tbl' branch never
        # ran. Pair each format with its own column instead.
        for colname, fmt in zip(t.keys(), old_formats):
            t.columns[colname].format = fmt

Contents