Source code for piff.outliers

# Copyright (c) 2016 by Mike Jarvis and the other collaborators on GitHub at
# https://github.com/rmjarvis/Piff  All rights reserved.
#
# Piff is free software: Redistribution and use in source and binary forms
# with or without modification, are permitted provided that the following
# conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the disclaimer given in the accompanying LICENSE
#    file.
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the disclaimer given in the documentation
#    and/or other materials provided with the distribution.

"""
.. module:: outliers
"""

import math
import numpy as np
import math
import galsim
from scipy.stats import chi2

from .util import write_kwargs, read_kwargs

[docs]class Outliers(object): """The base class for handling outliers. This is essentially an abstract base class intended to define the methods that should be implemented by any derived class. """
[docs] @classmethod def process(cls, config_outliers, logger=None): """Parse the outliers field of the config dict. :param config_outliers: The configuration dict for the outliers field. :param logger: A logger object for logging debug info. [default: None] :returns: an Outliers instance """ import piff if 'type' not in config_outliers: raise ValueError("config['outliers'] has no type field") # Get the class to use for the outliers # Not sure if this is what we'll always want, but it would be simple if we can make it work. outliers_class = getattr(piff, config_outliers['type'] + 'Outliers') # Read any other kwargs in the outliers field kwargs = outliers_class.parseKwargs(config_outliers, logger) # Build outliers object outliers = outliers_class(**kwargs) return outliers
[docs] @classmethod def parseKwargs(cls, config_outliers, logger=None): """Parse the outliers field of a configuration dict and return the kwargs to use for initializing an instance of the class. The base class implementation just returns the kwargs as they are, but derived classes might want to override this if they need to do something more sophisticated with them. :param config_outliers: The outliers field of the configuration dict, config['outliers'] :param logger: A logger object for logging debug info. [default: None] :returns: a kwargs dict to pass to the initializer """ kwargs = {} kwargs.update(config_outliers) kwargs.pop('type',None) return kwargs
[docs] def write(self, fits, extname): """Write an Outliers to a FITS file. :param fits: An open fitsio.FITS object :param extname: The name of the extension to write the outliers information. """ # First write the basic kwargs that works for all Outliers classes outliers_type = self.__class__.__name__ write_kwargs(fits, extname, dict(self.kwargs, type=outliers_type)) # Now do any class-specific steps. self._finish_write(fits, extname)
def _finish_write(self, fits, extname): """Finish the writing process with any class-specific steps. The base class implementation doesn't do anything, which is often appropriate, but this hook exists in case any Outliers classes need to write extra information to the fits file. :param fits: An open fitsio.FITS object :param extname: The base name of the extension """ pass
[docs] @classmethod def read(cls, fits, extname): """Read a Outliers from a FITS file. :param fits: An open fitsio.FITS object :param extname: The name of the extension with the outliers information. :returns: an Outliers handler """ import piff assert extname in fits assert 'type' in fits[extname].get_colnames() outliers_type = fits[extname].read()['type'] assert len(outliers_type) == 1 try: outliers_type = str(outliers_type[0].decode()) except AttributeError: # fitsio 1.0 returns strings outliers_type = outliers_type[0] # Check that outliers_type is a valid Outliers type. outliers_classes = piff.util.get_all_subclasses(piff.Outliers) valid_outliers_types = dict([ (c.__name__, c) for c in outliers_classes ]) if outliers_type not in valid_outliers_types: raise ValueError("outliers type %s is not a valid Piff Outliers"%outliers_type) outliers_cls = valid_outliers_types[outliers_type] kwargs = read_kwargs(fits, extname) kwargs.pop('type',None) outliers = outliers_cls(**kwargs) outliers._finish_read(fits, extname) return outliers
def _finish_read(self, fits, extname): """Finish the reading process with any class-specific steps. The base class implementation doesn't do anything, which is often appropriate, but this hook exists in case any Outliers classes need to read extra information from the fits file. :param fits: An open fitsio.FITS object. :param extname: The base name of the extension. """ pass
class MADOutliers(Outliers): # pragma: no cover (This isn't functional yet.) """An Outliers handler using mean absolute deviation (MAD) for defining outliers. MAD = < |x - median] > The range of values to keep in the sample are [median - nmad * MAD, median + nmad * MAD] where nmad is a parameter specified by the user. The user can specify this parameter in one of two ways. 1. The user can specify nmad directly. 2. The user can specify nsigma, in which case nmad = sqrt(pi/2) nsigma, the equivalent value for a Gaussian distribution. Either nmad or nsigma must be provided. :param nmad: The number of mean absolute deviations from the median at which something is declared an outlier. :param nsigma: The number of sigma equivalent if the underlying distribution is Gaussian. """ def __init__(self, nmad=None, nsigma=None): if nmad is None and nsigma is None: raise TypeError("Either nmad or nsigma is required") if nsigma is not None: nmad = math.sqrt(math.pi/2) * nsigma self.nmad = nmad self.kwargs = { 'nmad' : nmad, } def removeOutliers(self, stars): """Remove outliers from a list of stars based on their fit parameters. :param stars: A list of Star instances :returns: A new list of stars without outliers """ # I started writing this class, but then realized this isn't what we want to do # for the PixelGrid model. So leave this for now... raise NotImplementedError("MAD algorithm not implemented")
[docs]class ChisqOutliers(Outliers): """An Outliers handler using the chisq of the residual of the interpolated star with the original. The user can specify the threshold in one of four ways: 1. The user can specify thresh directly. 2. The user can specify ndof to give a multiple of the number of degrees of freedom of the model, thresh = ndof * dof. 3. The user can specify prob to reject according to the probability that the chisq distribution for the model's number of degrees of freedom would exceed chisq. 4. The user can specify nsigma, in which case thresh is calculated according to the chisq distribution to give the equivalent rejection probability that corresponds to that many sigma. Exactly one of thresh, ndof, nsigma, prob must be provided. There is an option to include reserve stars in the outlier rejection, which is enabled by setting ``include_reserve=True``. This is probably not a good idea normally. Reserve stars are often preferentially targeted by the outlier removal, which somewhat lessens their case as fair test points for diagnostics. However, it is still an option in case you want to use it. :param thresh: The threshold in chisq above which an object is declared an outlier. :param ndof: The threshold as a multiple of the model's dof. :param prob: The probability limit that a chisq distribution with the model's dof would exceed the given value. :param nsigma: The number of sigma equivalent for the probability that a chisq distribution would exceed the given value. :param max_remove: The maximum number of outliers to remove on each iteration. If this is a float < 1.0, then this is interpreted as a maximum fraction of stars to remove. e.g. 0.01 will remove at most 1% of the stars. [default: None] :param include_reserve: Whether to include reserve stars as potential stars to be removed as outliers. [default: False] """ def __init__(self, thresh=None, ndof=None, prob=None, nsigma=None, max_remove=None, include_reserve=False): if all( (thresh is None, ndof is None, prob is None, nsigma is None) ): raise TypeError("One of thresh, ndof, prob, or nsigma is required.") if thresh is not None and any( (ndof is not None, prob is not None, nsigma is not None) ): raise TypeError("Only one of thresh, ndof, prob, or nsigma may be given.") if ndof is not None and any( (prob is not None, nsigma is not None) ): raise TypeError("Only one of thresh, ndof, prob, or nsigma may be given.") if prob is not None and nsigma is not None: raise TypeError("Only one of thresh, ndof, prob, or nsigma may be given.") # The only one of these we can convert now is nsigma, which we can convert into prob. # Going from either prob or ndof to thresh requires knowledge of dof. if nsigma is not None: prob = math.erfc(nsigma / 2**0.5) self.thresh = thresh self.ndof = ndof self.prob = prob self.max_remove = max_remove self.include_reserve = include_reserve self.kwargs = { 'thresh' : thresh, 'ndof' : ndof, 'prob' : prob, 'max_remove' : max_remove, 'include_reserve' : include_reserve, } def _get_thresh(self, dof): if self.thresh is not None: return self.thresh elif self.ndof is not None: return self.ndof * dof else: return chi2.isf(self.prob, dof)
[docs] def removeOutliers(self, stars, logger=None): """Remove outliers from a list of stars based on their chisq values. :param stars: A list of Star instances :param logger: A logger object for logging debug info. [default: None] :returns: stars, nremoved A new list of stars without outliers, and how many outliers were removed. """ logger = galsim.config.LoggerWrapper(logger) if self.include_reserve: use_stars = stars else: use_stars = [ s for s in stars if not s.is_reserve ] nstars = len(use_stars) logger.debug("Checking %d stars for outliers", nstars) chisq = np.array([ s.fit.chisq for s in use_stars ]) dof = np.array([ s.fit.dof for s in use_stars ]) # Scale up threshold by global chisq/dof. factor = np.sum(chisq) / np.sum(dof) if factor < 1: factor = 1 thresh = np.array([ self._get_thresh(d) for d in dof ]) * factor if np.all(dof == dof[0]): logger.debug("dof = %f, thresh = %f * %f = %f", dof[0], self._get_thresh(dof[0]), factor, thresh[0]) else: min_dof = np.min(dof) max_dof = np.max(dof) min_thresh = self._get_thresh(min_dof) max_thresh = self._get_thresh(max_dof) logger.debug("Minimum dof = %d with thresh = %f * %f = %f", min_dof, min_thresh, factor, min_thresh*factor) logger.debug("Maximum dof = %d with thresh = %f * %f = %f", max_dof, max_thresh, factor, max_thresh*factor) nremoved = np.sum(~(chisq <= thresh)) # Write it as not chisq <= thresh in case of nans. if nremoved == 0: return stars, 0 logger.info("Found %d stars with chisq > thresh", nremoved) # Update max_remove if necessary max_remove = self.max_remove if max_remove is not None and 0 < max_remove < 1: max_remove = int(math.ceil(max_remove * len(stars))) if max_remove is None or nremoved <= max_remove: good = chisq <= thresh good_stars = [ s for g, s in zip(good, use_stars) if g ] else: # Since the thresholds are not necessarily all equal, this might be tricky to # figure out which ones should be removed. # e.g. if max_remove == 1 and we have items with # chisq = 20, thresh = 15 # chisq = 40, thresh = 32 # which one should we remove? # The first has larger chisq/thresh, and the second has larger chisq - thresh. # I semi-arbitrarily remove based on the difference. nremoved = max_remove diff = chisq - thresh new_thresh_index = np.argpartition(diff, -nremoved)[-nremoved] new_thresh = diff[new_thresh_index] good = diff < new_thresh good_stars = [ s for g, s in zip(good, use_stars) if g ] # Add back the reserve stars if we aren't including them in the cut. if not self.include_reserve: good_stars += [ s for s in stars if s.is_reserve ] logger.debug("chisq = %s",chisq[~(chisq <= thresh)]) logger.debug("thresh = %s",thresh[~(chisq <= thresh)]) logger.debug("flux = %s",[s.flux for g,s in zip(good,stars) if not g]) assert nremoved == len(stars) - len(good_stars) return good_stars, nremoved