Source code for pyddem.fit_tools

"""
pyddem.fit_tools provides tools to derive filter and interpolate DEM stacks into elevation time series
"""
import os
import sys
import time

os.environ["OMP_NUM_THREADS"] = "1"  # export OMP_NUM_THREADS=4
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # export OPENBLAS_NUM_THREADS=4
os.environ["MKL_NUM_THREADS"] = "1"  # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"  # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "1"  # export NUMEXPR_NUM_THREADS=6
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
import numpy as np
import gdal
from dask.diagnostics import ProgressBar
import pandas as pd
import functools
import xarray as xr
import geopandas
import matplotlib.pylab as plt
import multiprocessing as mp
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import animation
from mpl_toolkits.axes_grid1 import make_axes_locatable
from itertools import chain
from scipy import stats
from scipy.interpolate import interp1d
from scipy.ndimage import filters
from skimage.morphology import disk
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, RationalQuadratic as RQ, ExpSineSquared as ESS, \
    PairwiseKernel
from numba import jit, vectorize, float32
from llc import jit_filter_function
from pybob.GeoImg import GeoImg
from pybob.coreg_tools import get_slope
from pybob.image_tools import create_mask_from_shapefile
from pybob.plot_tools import set_pretty_fonts
from pybob.bob_tools import mkdir_p
from pybob.plot_tools import plot_polygon_df
import pyddem.stack_tools as st
import pyddem.tdem_tools as tt
import pyddem.vector_tools as vt
from pybob.ddem_tools import nmad
from warnings import filterwarnings

filterwarnings('ignore')


[docs]def make_dh_animation(ds, fn_shp=None,month_a_year=None, rates=False, figsize=(10,10), t0=None, t1=None, dh_max=20, var='z', cmap='RdYlBu',
                      xlbl='easting (km)',
                      ylbl='northing (km)'):
    """
       Generate a GIF from elevation time series

       :param ds: xarray Dataset of elevation time series
       :param month_a_year: Numerical month to keep only one month per year in the animation
       :param rates: Display rates instead of cumulative change
       :param figsize: Tuple of figure size
       :param t0: Starting date
       :param t1: End date
       :param dh_max: Max scale for colorbar
       :param var: Variable to display in the dataset
       :param cmap: Colormap
       :param xlbl: xlabel for animation
       :param ylbl: ylabel for animation

       :returns: Figure, List of images and annotations for animation
       """

    set_pretty_fonts()
    fig = plt.figure(figsize=figsize)
    ax = fig.gca()

    ds_sub = ds.sel(time=slice(t0, t1))

    if month_a_year is not None:
        t_vals = ds_sub.time.values
        y0 = t_vals[0].astype('datetime64[D]').astype(object).year
        y1 = t_vals[-1].astype('datetime64[D]').astype(object).year

        t_vec = []
        for y in np.arange(y0, y1, 1):
            t = np.datetime64(str(y) + '-' + str(month_a_year).zfill(2) + '-01')
            td = np.array([(t - t_vals[i].astype('datetime64[D]')).astype(int) for i in range(len(t_vals))])

            closer_dat = t_vals[(np.abs(td) == np.min(np.abs(td)))][0]
            t_vec.append(closer_dat)

        ds_sub = ds.sel(time=t_vec)
        # mid = int(np.floor(len(ds_sub.time.values)/2))

    if var == 'z':
        if rates:
            dh_ = np.diff(ds_sub.variables[var].values,axis=0)
        else:
            dh_ = ds_sub.variables[var].values - ds_sub.variables[var].values[0]
    elif var == 'z_ci':
        dh_ = ds_sub.variables[var].values

    times = np.array([np.datetime_as_string(t.astype('datetime64[D]')) for t in ds_sub.time.values])
    nice_ext = np.array([ds.x.values.min(), ds.x.values.max(), ds.y.values.min(), ds.y.values.max()]) / 1000
    ims = []

    im = ax.imshow(dh_[0], extent=nice_ext, vmin=-dh_max, vmax=dh_max, cmap=cmap)
    if rates:
        annot = times[0]+'-'+times[1]
    else:
        annot = times[0]
    ann = ax.annotate(annot, xy=(0.05, 0.05), xycoords='axes fraction', fontsize=20,
                      fontweight='bold', color='black', family='monospace')
    ims.append([im, ann])

    tmp_geoimg = st.make_geoimg(ds)
    df = geopandas.read_file(fn_shp).to_crs({'init':'epsg:'+str(tmp_geoimg.epsg)})
    fig, _ = plot_polygon_df(df,fig=fig,ax=ax,facecolor='None',edgecolor='black',zorder=30)
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax, extend='both')


    cax.set_ylabel('elevation change (m)')
    ax.set_ylabel(ylbl)
    ax.set_xlabel(xlbl)

    if rates:
        l = len(times[1:])-1
    else:
        l = len(times[1:])
    for i in range(l):
        im = ax.imshow(dh_[i + 1], vmin=-dh_max, vmax=dh_max, cmap=cmap, extent=nice_ext)
        if rates:
            annot = times[i+1] + '-' + times[i+2]
        else:
            annot = times[i+1]
        ann = ax.annotate(annot, xy=(0.05, 0.05), xycoords='axes fraction', fontsize=20,
                          fontweight='bold', color='black', family='monospace')
        ims.append([im, ann])

    plt.tight_layout()

    return fig, ims


def write_animation(fig, ims, outfilename='output.gif', ani_writer='imagemagick', **kwargs):
    ani = animation.ArtistAnimation(fig, ims, **kwargs)
    ani.save(outfilename, writer=ani_writer)


[docs]def get_dem_date(ds, ds_filt, t, outname):
    """
       Extract a DEM from an elevation time series, with elevation error and time lag to the closest observation

       :param ds: xarray Dataset of elevation time series
       :param ds_filt: xarray dataset of boolean data cube of valid observation used to generate the time series
       ;param t: Date of extraction
       :outname: Filename for output rasters

       :returns:
       """

    tmp_img = st.make_geoimg(ds)

    dc = ds.interp(time=[t])

    h = dc.variables['z'].values[0]
    err = dc.variables['z_ci'].values[0]

    dates = [t]
    t_vals = list(ds_filt.time.values)
    dates_rm_dupli = sorted(list(set(t_vals)))
    ind_firstdate = []
    for i, date in enumerate(dates_rm_dupli):
        ind_firstdate.append(t_vals.index(date))
    ds_filt2 = ds_filt.isel(time=np.array(ind_firstdate))
    for i in range(len(dates_rm_dupli)):
        t_ind = (t_vals == dates_rm_dupli[i])
        if len(t_ind) > 1:
            ds_filt2.z.values[i, :] = np.any(ds_filt.z[t_vals == dates_rm_dupli[i], :].values.astype(bool), axis=0)
    y0 = np.datetime64('2000-01-01')
    ftime = ds_filt2.time.values
    ftime_delta = np.array([t - y0 for t in ftime])
    days = [td.astype('timedelta64[D]').astype(int) for td in ftime_delta]

    # reindex to get closest not-NaN time value of the date vector
    filt_arr = np.array(ds_filt2.z.values, dtype='float32')
    filt_arr[filt_arr == 0] = np.nan
    days = np.array(days)
    filt_arr = filt_arr * days[:, None, None]
    at_least_2 = np.count_nonzero(~np.isnan(filt_arr), axis=0) >= 2
    filt_tmp = filt_arr[:, at_least_2]
    out_arr = np.copy(filt_tmp)
    for i in range(np.shape(filt_tmp)[1]):
        ind = ~np.isnan(filt_tmp[:, i])
        fn = interp1d(days[ind], filt_tmp[:, i][ind], kind='nearest', fill_value='extrapolate', assume_sorted=True)
        out_arr[:, i] = fn(days)
    filt_arr[:, at_least_2] = out_arr
    ds_filt2.z.values = filt_arr
    ds_filt_sub = ds_filt2.reindex(time=dates, method='nearest')

    for i in range(len(dates)):
        date = dates[i]
        day_diff = (date - y0).astype('timedelta64[D]').astype(int)
        ds_filt_sub.z.values[i, :] = np.abs(ds_filt_sub.z.values[i, :] - np.ones(ds_filt_sub.z.shape[1:3]) * day_diff)

    dt = ds_filt_sub.z.values[0]

    tmp_img.img = h
    tmp_img.write(os.path.join(os.path.dirname(outname), os.path.basename(outname) + '_AST_SRTM_h.tif'))
    tmp_img.img = err
    tmp_img.write(os.path.join(os.path.dirname(outname), os.path.basename(outname) + '_AST_SRTM_herr.tif'))
    tmp_img.img = dt
    tmp_img.write(os.path.join(os.path.dirname(outname), os.path.basename(outname) + '_AST_SRTM_dt.tif'))


def get_dem_date_exact(ds, t, outname):
    tmp_img = st.make_geoimg(ds)

    times = sorted(list(set(list(ds.time.values))))
    df_time = pd.DataFrame()
    df_time = df_time.assign(time=times)
    df_time.index = pd.DatetimeIndex(pd.to_datetime(times))
    t_exact = df_time.iloc[df_time.index.get_loc(pd.to_datetime(t), method='nearest')][0]

    dc = ds.sel(time=[t_exact])

    h = dc.variables['z'].values[0]

    tmp_img.img = h
    tmp_img.write(os.path.join(os.path.dirname(outname), os.path.basename(outname) + '_ASTER_recons_h.tif'))


def get_full_dh(ds, outname, t0=None, t1=None):
    tmp_img = st.make_geoimg(ds)

    #filter pixel with 2000-2019 error superior than 500 m (very large, because it is overestimated in many instances)
    err_tot = np.sqrt(ds.variables['z_ci'].values[-1] ** 2 + ds.variables['z_ci'].values[0] ** 2)
    ind = err_tot > 500.

    dc = ds.sel(time=slice(t0, t1))

    dh = dc.variables['z'].values[-1] - dc.variables['z'].values[0]
    err = np.sqrt(dc.variables['z_ci'].values[-1] ** 2 + dc.variables['z_ci'].values[0] ** 2)

    dh[ind] = np.nan
    err[ind] = np.nan

    mkdir_p(os.path.join(os.path.dirname(outname), 'dh'))
    mkdir_p(os.path.join(os.path.dirname(outname), 'dh_err'))

    period = str(t0) + '_' + str(t1)
    tmp_img.img = dh
    tmp_img.write(os.path.join(os.path.dirname(outname), 'dh', os.path.basename(outname) + '_'+period+'_dh.tif'))
    tmp_img.img = err
    tmp_img.write(os.path.join(os.path.dirname(outname), 'dh_err',os.path.basename(outname) + '_'+period+'_dh_err.tif'))

def reproj_build_vrt(list_dh, utm, out_vrt, res):
    epsg = vt.epsg_from_utm(utm)

    list_fn_out = []
    for dh in list_dh:

        tile_name = os.path.basename(dh).split('_')[0]
        print(tile_name)
        img = GeoImg(dh)
        dest = gdal.Warp('', img.gd, format='MEM', dstSRS='EPSG:{}'.format(epsg), resampleAlg=gdal.GRA_NearestNeighbour,
                         xRes=res, yRes=res)

        try:
            out_img = GeoImg(dest)
        except:
            print('Could not reproject in that projection.')
            continue
        nodata_mask = vt.latlontile_nodatamask(out_img, tile_name)
        out_img.img[~nodata_mask] = np.nan

        fn_out = os.path.join(os.path.dirname(dh), os.path.splitext(os.path.basename(dh))[0] + '_' + str(epsg) + '.tif')
        list_fn_out.append(fn_out)
        out_img.write(fn_out)

    gdal.BuildVRT(out_vrt, list_fn_out, resampleAlg='bilinear')


def get_stack_mask(maskshp, ds):
    npix_y, npix_x = ds['z'][0].shape
    dx = np.round((ds.x.max().values - ds.x.min().values) / float(npix_x))
    dy = np.round((ds.y.min().values - ds.y.max().values) / float(npix_y))

    newgt = (ds.x.min().values - 0, dx, 0, ds.y.max().values - 0, 0, dy)

    drv = gdal.GetDriverByName('MEM')
    dst = drv.Create('', npix_x, npix_y, 1, gdal.GDT_Float32)

    sp = dst.SetProjection(ds.crs.spatial_ref)
    sg = dst.SetGeoTransform(newgt)

    wa = dst.GetRasterBand(1).WriteArray(ds['z'][0].values)
    md = dst.SetMetadata({'Area_or_point': 'Point'})
    del wa, sg, sp, md

    img = GeoImg(dst)
    mask = create_mask_from_shapefile(img, maskshp)
    return mask


[docs]def vgm_1d(argsin):
    """
    1D variogram sampling

    :param argsin: tuple of x values, y values, lag cutoff value and interval of x sampling (for easier parallel proc)

    :returns: semivariance
    """

    t_vals, detrend_elev, lag_cutoff, tstep = argsin

    # 1D variogram: inspired by http://connor-johnson.com/2014/03/20/simple-kriging-in-python/

    def SVh(P, h, bw):
        # empirical variogram for a single lag
        pd = np.abs(np.subtract.outer(P[:, 0], P[:, 0]))
        N = pd.shape[0]
        Z = list()
        for i in range(N):
            for j in range(i + 1, N):
                if (pd[i, j] >= h - bw) and (pd[i, j] <= h + bw):
                    Z.append((P[i, 1] - P[j, 1]) ** 2.0)
        if len(Z) > 0:
            return np.nansum(Z) / (2.0 * len(Z)), len(Z)
        else:
            return np.nan, 0

    def SV(P, hs, bw):
        # empirical variogram for a collection of lags
        sv = list()
        p = list()
        for h in hs:
            svh, ph = SVh(P, h, bw)
            sv.append(svh)
            p.append(ph)
        sv = [[sv[i], p[i]] for i in range(len(hs))]
        return np.array(sv).T

    ind_valid = ~np.isnan(detrend_elev)
    sample = np.column_stack((t_vals[ind_valid], detrend_elev[ind_valid]))

    hs = np.arange(0, lag_cutoff, tstep)
    sv = SV(sample, hs, tstep)

    return sv


def wrapper_vgm1d(argsin):
    t_vals, stack_detrend_elev, k, lag_cutoff, tstep = argsin

    _, nb_iter = np.shape(stack_detrend_elev)

    print('Pack of ' + str(nb_iter) + ' variograms number ' + str(k))

    lags = np.arange(0, lag_cutoff, tstep)

    vdata = np.zeros((len(lags), nb_iter))
    pdata = np.zeros((len(lags), nb_iter))
    for i in np.arange(nb_iter):
        sv = vgm_1d((t_vals, stack_detrend_elev[:, i], lag_cutoff, tstep))
        vdata[:, i] = sv[0]
        pdata[:, i] = sv[1]

    return vdata, pdata


[docs]def vgm_1d_med(argsin):
    """
    1D median variogram sampling

    :param argsin: tuple of x values, y values, lag cutoff value and interval of x sampling (for easier parallel proc)

    :returns: semivariance
    """

    t_vals, detrend_elev, lag_cutoff, tstep = argsin

    # 1D variogram: inspired by http://connor-johnson.com/2014/03/20/simple-kriging-in-python/

    def SVh(P, h, bw):
        # empirical variogram for a single lag
        pd = np.abs(np.subtract.outer(P[:, 0], P[:, 0]))
        N = pd.shape[0]
        Z = list()
        for i in range(N):
            for j in range(i + 1, N):
                if (pd[i, j] >= h - bw) and (pd[i, j] <= h + bw):
                    Z.append((P[i, 1] - P[j, 1]) ** 2.0)
        if len(Z) > 0:
            return np.array(Z)
        else:
            return np.array([np.nan])

    def SV(P, hs, bw):
        # empirical variogram for a collection of lags
        sv = list()
        for h in hs:
            svh = SVh(P, h, bw)
            sv.append(svh)
        sv = [sv[i] for i in range(len(hs))]
        return sv

    ind_valid = ~np.isnan(detrend_elev)
    sample = np.column_stack((t_vals[ind_valid], detrend_elev[ind_valid]))

    hs = np.arange(0, lag_cutoff, tstep)
    sv = SV(sample, hs, tstep)

    return sv


def wrapper_vgm1d_med(argsin):
    t_vals, stack_detrend_elev, k, lag_cutoff, tstep = argsin

    _, nb_iter = np.shape(stack_detrend_elev)

    print('Pack of ' + str(nb_iter) + ' variograms number ' + str(k))

    lags = np.arange(0, lag_cutoff, tstep)

    vlist = list()
    for i in np.arange(nb_iter):
        sv = vgm_1d_med((t_vals, stack_detrend_elev[:, i], lag_cutoff, tstep))
        vlist.append(sv)

    vzipped = list(zip(*vlist))

    vdata = []
    for i in range(len(vlist[0])):
        vdata.append(np.concatenate(vzipped[i]))

    return vdata


def wrapper_mask(argsin):
    mask, arr_mask, cube_mask = argsin

    out_mask = np.logical_and.reduce(
        (mask, cube_mask, arr_mask[None, :, :] * np.ones(np.shape(mask)[0], dtype=bool)[:, None, None]))

    return out_mask


def wrapper_estimate_var(argsin):
    dh, bins = argsin

    print('Binning slope: ' + str(bins[0][0]) + ' to ' + str(bins[0][1])
          + ' degrees and binning correlation: ' + str(bins[1][0]) + ' to ' + str(bins[1][1]) + ' percent.')

    start = time.time()
    nsamp = 10000
    print('Selecting a subsample of ' + str(nsamp) + ' points...')

    # sample a subset (usually on stable terrain)
    max_samp = len(dh)
    final_nsamp = min(max_samp, nsamp)
    subset = np.random.choice(max_samp, final_nsamp, replace=False)
    sub_dh = dh[subset]

    nmd = nmad(sub_dh)
    ns = final_nsamp
    std = np.nanstd(sub_dh)

    print('Elapsed for bin: ' + str(time.time() - start))

    return nmd, ns, std


[docs]def get_var_by_corr_slope_bins(ds, ds_arr, arr_slope, bins_slope, cube_corr, bins_corr, outfile, inc_mask=None,
                               exc_mask=None, nproc=1):
    """
    Sampling method for elevation measurement error from stacks of DEMs: binning of external variables and sampling of variance

    :param ds: xarray Dataset of data cube
    :param ds_arr: Data cube of elevations
    :param arr_slope: Slope raster
    :param bins_slope: Bins to sample categories of slope
    :param cube_corr: Data cube of stereo-correlations
    :param bins_corr: Bins to sample categories of stereo-correlations
    :param outfile: Filename for output files
    :param inc_mask: Filename of inclusion shapefile for the sampling of variance
    :param exc_mask: Filename of exclusion shapefile for the sampling of variance
    :param nproc: Number of cores for multiprocessing [1]

    :returns:
    """

    ref_arr = ds.variables['ref_z'].values

    mask = np.ones(np.shape(ref_arr), dtype=bool)
    if inc_mask is not None:
        mask = np.logical_and(get_stack_mask(inc_mask, ds), mask)
    if exc_mask is not None:
        mask = np.logical_and(~get_stack_mask(exc_mask, ds), mask)

    dh = ds_arr - ref_arr[None, :, :]

    mask_init = np.logical_and(mask[None, :, :] * np.ones(np.shape(dh)[0], dtype=bool)[:, None, None], np.isfinite(dh))

    list_arr_mask = [np.logical_and(np.abs(arr_slope) >= bins_slope[i], np.abs(arr_slope) < bins_slope[i + 1]) for i in
                     range(len(bins_slope) - 1)]
    list_cube_mask = [np.logical_and(np.abs(cube_corr) >= bins_corr[j], np.abs(cube_corr) < bins_corr[j + 1]) for j in
                      range(len(bins_corr) - 1)]

    if nproc == 1:
        print('Using 1 proc to derive variance...')
        list_nmad, list_ns, list_std, list_bin_slope, list_bin_corr, list_id = ([] for i in range(6))
        for i in range(len(bins_slope) - 1):
            for j in range(len(bins_corr) - 1):
                print('Binning slope: ' + str(bins_slope[i]) + ' to ' + str(bins_slope[i + 1])
                      + ' degrees and binning correlation: ' + str(bins_corr[j]) + ' to ' + str(
                    bins_corr[j + 1]) + ' percent.')

                slope_mask = list_arr_mask[i]
                corr_mask = list_cube_mask[j]

                nmd, ns, std = estimate_var(dh, mask_init, arr_mask=slope_mask, cube_mask=corr_mask, nsamp=10000)

                list_nmad.append(nmd)
                list_ns.append(ns)
                list_std.append(std)
                list_id.append('slope: ' + str(bins_slope[i]) + '-'
                               + str(bins_slope[i + 1]) + ',corr:' + str(bins_corr[j]) + '-' + str(bins_corr[j + 1]))
                list_bin_slope.append(bins_slope[i] + (bins_slope[i + 1] - bins_slope[i]) / 2)
                list_bin_corr.append(bins_corr[j] + (bins_corr[j + 1] - bins_corr[j]) / 2)
    else:
        # TODO: does not work, even with np.copy()...

        print('Using ' + str(nproc) + ' procs to derive variance...')
        pool = mp.Pool(nproc, maxtasksperchild=1)

        # print('Creating bin masks...')
        # argsin_mask = [(mask_init,list_arr_mask[i],list_cube_mask[j]) for i in range(len(bins_slope)-1) for j in range(len(bins_corr)-1)]
        # list_mask = pool.map(wrapper_mask,argsin_mask,chunksize=1)
        # pool.close()
        # pool.join()
        # pool = mp.Pool(nproc,maxtasksperchild=1)

        list_mask = [np.logical_and.reduce((mask_init, list_cube_mask[j],
                                            list_arr_mask[i][None, :, :] * np.ones(np.shape(dh)[0], dtype=bool)[:, None,
                                                                           None])) for i in range(len(bins_slope) - 1)
                     for j in range(len(bins_corr) - 1)]

        list_dh = [dh[m] for m in list_mask]
        list_bins = [((bins_slope[i], bins_slope[i + 1]), (bins_corr[j], bins_corr[j + 1])) for i in
                     range(len(bins_slope) - 1) for j in range(len(bins_corr) - 1)]
        argsin = [(list_dh[i], list_bins[i]) for i in range(len(list_dh))]
        print('Deriving variance...')
        outputs = pool.map(wrapper_estimate_var, argsin, chunksize=1)
        pool.close()
        pool.join()

        zipped = list(zip(*outputs))

        list_nmad = zipped[0]
        list_ns = zipped[1]
        list_std = zipped[2]
        list_id = ['slope: ' + str(bins_slope[i]) + '-' + str(bins_slope[i + 1]) + ',corr:' + str(bins_corr[j]) + '-'
                   + str(bins_corr[j + 1]) for i in range(len(bins_slope) - 1) for j in range(len(bins_corr) - 1)]
        list_bin_slope = [(bins_slope[i] + (bins_slope[i + 1] - bins_slope[i]) / 2) for i in range(len(bins_slope) - 1)
                          for j in range(len(bins_corr) - 1)]
        list_bin_corr = [(bins_corr[j] + (bins_corr[j + 1] - bins_corr[j]) / 2) for i in range(len(bins_slope) - 1) for
                         j in range(len(bins_corr) - 1)]

    df = pd.DataFrame()
    df = df.assign(bin_slope=list_bin_slope, bin_corr=list_bin_corr, nmad=list_nmad, std=list_std, nsamp=list_ns,
                   id=list_id)

    df.to_csv(outfile)


def get_var_by_bin(ds, ds_arr, arr_vals, bin_vals, outfile, inc_mask=None, exc_mask=None, rast_cube_mask=False):
    ref_arr = ds.variables['ref_z'].values

    dh = ds_arr - ref_arr[None, :, :]

    mask = np.ones(np.shape(ref_arr), dtype=bool)
    if inc_mask is not None:
        mask = np.logical_and(get_stack_mask(inc_mask, ds), mask)
    if exc_mask is not None:
        mask = np.logical_and(~get_stack_mask(exc_mask, ds), mask)

    mask_init = np.logical_and(mask[None, :, :] * np.ones(np.shape(dh)[0], dtype=bool)[:, None, None], np.isfinite(dh))

    df_all = pd.DataFrame()
    for i in range(len(bin_vals) - 1):
        print('Binning from ' + str(bin_vals[i]) + ' to ' + str(bin_vals[i + 1]))

        if not rast_cube_mask:
            arr_mask = np.logical_and(np.abs(arr_vals) >= bin_vals[i], np.abs(arr_vals) < bin_vals[i + 1])
            cube_mask = None
        else:
            arr_mask = None
            cube_mask = np.logical_and(np.abs(arr_vals) >= bin_vals[i], np.abs(arr_vals) < bin_vals[i + 1])

        nmd, ns, std = estimate_var(dh, mask_init, arr_mask=arr_mask, cube_mask=cube_mask, nsamp=10000)

        bin_id = str(bin_vals[i]) + '-' + str(bin_vals[i + 1])
        df = pd.DataFrame()
        df = df.assign(nmad=[nmd], std=[std], bin_val=[bin_vals[i] + 0.5 * (bin_vals[i + 1] - bin_vals[i])], nsamp=[ns],
                       id=[bin_id])
        df_all = df_all.append(df, ignore_index=True)

    df_all.to_csv(outfile)


[docs]def estimate_var(dh, mask_cube, arr_mask=None, cube_mask=None, nsamp=100000):
    """
    Estimation of elevation variance from data cube of difference to reference elevations

    :param dh: Data cube of difference between stack of DEMs and reference DEM
    :param mask_cube: Mask where sampling must occur
    :param arr_mask: Add a raster mask (no time dimension)
    :param cube_mask: Add a data cube mask
    :param nsamp: Number of samples to draw

    :returns: NMAD, number of samples, STD
    """

    start = time.time()

    # rasterize mask
    if arr_mask is not None:
        mask_cube = np.logical_and(mask_cube,
                                   arr_mask[None, :, :] * np.ones(np.shape(dh)[0], dtype=bool)[:, None, None])

    if cube_mask is not None:
        mask_cube = np.logical_and(mask_cube, cube_mask)

    print('Selecting a subsample of ' + str(nsamp) + ' points...')
    # sample a subset (usually on stable terrain)
    max_samp = np.count_nonzero(mask_cube)
    index = np.where(mask_cube)
    final_nsamp = min(max_samp, nsamp)
    subset = np.random.choice(max_samp, final_nsamp, replace=False)
    index_subset = (index[0][subset], index[1][subset], index[2][subset])
    mask_subset = np.zeros(np.shape(mask_cube), dtype=np.bool)
    mask_subset[index_subset] = True

    sub_dh = dh[mask_subset]

    print('Elapsed for bin: ' + str(time.time() - start))

    return nmad(sub_dh), final_nsamp, np.nanstd(sub_dh)


[docs]def manual_refine_sampl_temporal_vgm(fn_stack, fn_ref_dem, out_dir, filt_ref='both', max_dhdt=[-50, 50]
                                     , ref_dem_date=np.datetime64('2015-01-01'), inc_mask=None, gla_mask=None, nproc=1):
    """
    Full sampling method of temporal variogram from stacks of DEMs: pre-filtering of data, binning of external variables and sampling of temporal variograms

    :param fn_stack: Filename of netcdf stack of DEMs
    :param fn_ref_dem: Filename of reference DEM
    :param out_dir: Output directory
    :param filt_ref: Filtering method
    :param max_dhdt: Maximum positive/negative elevation change rate per year from reference elevations [-50 m,50 m]
    :param ref_dem_date: Date of reference DEM
    :param inc_mask: Filename of inclusion shapefile for the stack
    :param gla_mask: Filename of inclusion shapefile for the sampling of variogram
    :param nproc: Number of cores for multiprocessing [1]

    :returns:
    """

    # let's look at a few possible dependencies for this temporal vgm
    print('Working on ' + fn_stack)

    ds = xr.open_dataset(fn_stack)
    ds.load()
    start = time.time()

    print('Original temporal size of stack is ' + str(ds.time.size))
    print('Original spatial size of stack is ' + str(ds.x.size) + ',' + str(ds.y.size))

    if inc_mask is not None:
        # ds_orig = ds.copy()
        ds = isel_maskout(ds, inc_mask)
        if ds is None:
            print('Inclusion mask has no valid pixels in this extent. Skipping...')
            return
        print('Temporal size of stack is now: ' + str(ds.time.size))
        print('Spatial size of stack is now: ' + str(ds.x.size) + ',' + str(ds.y.size))

    print('Filtering with max RMSE of 20...')

    keep_vals = ds.uncert.values < 20
    ds = ds.isel(time=keep_vals)
    print('Temporal size of stack is now: ' + str(ds.time.size))

    ds_arr = ds.z.values
    t_vals = ds.time.values

    # pre-filtering
    if fn_ref_dem is not None:
        assert filt_ref in ['min_max', 'time', 'both'], "fn_ref must be one of: min_max, time, both"
        ds_arr_filt = prefilter_stack(ds, ds_arr, fn_ref_dem, t_vals, filt_ref=filt_ref, ref_dem_date=ref_dem_date,
                                      max_dhdt=max_dhdt, nproc=nproc)
        print('Elapsed time is ' + str(time.time() - start))

    fn_final = os.path.join(os.path.dirname(fn_stack), os.path.splitext(os.path.basename(fn_stack))[0] + '_final.nc')
    fn_dh = os.path.join(os.path.dirname(fn_stack), os.path.splitext(os.path.basename(fn_stack))[0] + '_final_dh.tif')
    # TODO: remove when all those are corrected
    if not os.path.exists(fn_dh):
        fn_dh = os.path.join(os.path.dirname(fn_stack),
                             os.path.splitext(os.path.basename(fn_stack))[0] + '_final.nc_dh.tif')

    ds_final = xr.open_dataset(fn_final)
    arr_slope = ds_final.slope.values
    arr_dh = GeoImg(fn_dh).img
    tmp_geo = st.make_geoimg(ds)
    tmp_dem = GeoImg(fn_ref_dem)
    ref_dem = tmp_dem.reproject(tmp_geo).img

    bins_slope = np.arange(0, 60, 10)
    bins_dh = [-300, -200, -100, -50, -20, -10, 0, 10, 20, 50]
    # bins_dh=[-20,-10]
    bins_elev = np.arange(np.nanmin(ref_dem) - np.nanmin(ref_dem) % 200, np.nanmax(ref_dem), 200)

    fn_slope_tvar = os.path.join(out_dir, os.path.splitext(os.path.basename(fn_stack))[0] + '_slope_tvar.csv')
    fn_dh_tvar = os.path.join(out_dir, os.path.splitext(os.path.basename(fn_stack))[0] + '_dh_tvar.csv')
    fn_elev_tvar = os.path.join(out_dir, os.path.splitext(os.path.basename(fn_stack))[0] + '_elev_tvar.csv')

    get_vgm_by_bin(ds, ds_arr, arr_slope, bins_slope, fn_slope_tvar, inc_mask=gla_mask, nproc=nproc)
    get_vgm_by_bin(ds, ds_arr, ref_dem, bins_elev, fn_elev_tvar, inc_mask=gla_mask, nproc=nproc)
    get_vgm_by_bin(ds, ds_arr_filt, arr_dh, bins_dh, fn_dh_tvar, inc_mask=gla_mask, nproc=nproc)


def get_vgm_by_bin(ds, ds_arr, arr_vals, bin_vals, outfile, inc_mask=None, exc_mask=None, nproc=1):
    df_all = pd.DataFrame()
    for i in range(len(bin_vals) - 1):
        print('Binning from ' + str(bin_vals[i]) + ' to ' + str(bin_vals[i + 1]))

        add_mask = np.logical_and(arr_vals >= bin_vals[i], arr_vals < bin_vals[i + 1])
        lags, vmean, vstd = estimate_vgm(ds, ds_arr, inc_mask=inc_mask, exc_mask=exc_mask, rast_mask=add_mask,
                                         nproc=nproc, nsamp=10000)
        id = str(bin_vals[i]) + '-' + str(bin_vals[i + 1])

        df = pd.DataFrame()
        df = df.assign(lags=lags, vmean=vmean, vstd=vstd)
        df = df.assign(id=[id] * len(df.index),
                       bin_val=[bin_vals[i] + 0.5 * (bin_vals[i + 1] - bin_vals[i])] * len(df.index))
        df_all = df_all.append(df, ignore_index=True)

    df_all.to_csv(outfile)


[docs]def estimate_vgm(ds, ds_arr, inc_mask=None, exc_mask=None, rast_mask=None, nsamp=10000, tstep=0.25, lag_cutoff=None,
                 min_obs=8, nproc=1, pack_size=50):
    """
    Aggregate 1D temporal variograms for multiple stack pixels: random sampling within a inclusion mask

    :param ds: xarray Dataset of data cube
    :param ds_arr: Data cube of elevations
    :param inc_mask: Filename of inclusion shapefile
    :param exc_mask: Filename of exclusion shapefile
    :param rast_mask: Additional mask (boolean raster)
    :param nsamp: Number of pixels sampled in time
    :param tstep: Time interval for aggregating pairwise time lags
    :param lag_cutoff: Maximum time lag to sample
    :param min_obs: Minimum of valid observation per pixel for sampling
    :param nproc: Number of cores for multiprocessing [1]

    :returns: Lags, Variance, Variance dispersion
    """

    # estimate 1D variogram for multiple pixels: random sampling within mask

    # rasterize mask
    mask = np.ones(np.shape(ds_arr[0]), dtype=bool)

    if inc_mask is not None:
        mask = np.logical_and(get_stack_mask(inc_mask, ds), mask)

    if exc_mask is not None:
        mask = np.logical_and(~get_stack_mask(exc_mask, ds), mask)

    if rast_mask is not None:
        mask = np.logical_and(rast_mask, mask)

    # count number of valid temporal observation for each pixel
    nb_arr = np.nansum(~np.isnan(ds_arr), axis=0)
    mask = np.logical_and(mask, nb_arr >= min_obs)

    print(
        'Selecting a subsample of ' + str(nsamp) + ' points with at least ' + str(min_obs) + ' observations in time...')
    # sample a subset
    max_samp = np.count_nonzero(mask)
    index = np.where(mask)
    final_nsamp = min(max_samp, nsamp)
    subset = np.random.choice(max_samp, final_nsamp, replace=False)
    index_subset = (index[0][subset], index[1][subset])
    mask_subset = np.zeros(np.shape(mask), dtype=np.bool)
    mask_subset[index_subset] = True

    ds_samp = ds_arr[:, mask_subset]
    # ds_tmp = ds.copy()
    # ds_tmp.z.values = ds_arr
    # ds_samp = ds_tmp.isel(y=index[0][subset],x=index[1][subset])['z'].values

    # read and convert time values
    t_vals = ds['time'].values

    y0 = t_vals[0].astype('datetime64[D]').astype(object).year
    y1 = t_vals[-1].astype('datetime64[D]').astype(object).year
    total_delta = np.datetime64('{}-01-01'.format(int(y1))) - np.datetime64('{}-01-01'.format(int(y0)))
    ftime_delta = np.array([t - np.datetime64('{}-01-01'.format(int(y0))) for t in t_vals])
    t_scale = (ftime_delta / total_delta) * (int(y1) - y0)

    if lag_cutoff is None:
        lag_cutoff = np.max(t_scale) - np.min(t_scale)

    lags = np.arange(0, lag_cutoff, tstep) + 0.5 * tstep

    # get variance/lags and number of pairwise/lags for each pixel

    # old method, changing for median more robust to outliers
    # vdata = np.zeros((len(lags), final_nsamp))
    # pdata = np.zeros((len(lags), final_nsamp))
    if nproc == 1:
        print('Drawing variograms with 1 core...')
        for i in np.arange(final_nsamp):
            # old method, changing for median more robust to outliers
            # sv = vgm_1d((t_scale, ds_samp[:,i].flatten(),lag_cutoff,tstep))
            # vdata[:,i]=sv[0]
            # pdata[:,i]=sv[1]
            vdata = vgm_1d((t_scale, ds_samp[:, i].flatten(), lag_cutoff, tstep))
    else:
        print('Drawing variograms with ' + str(nproc) + ' cores...')
        pool = mp.Pool(nproc, maxtasksperchild=1)
        argsin = [(t_scale, ds_samp[:, i:min(i + pack_size, final_nsamp)], k, lag_cutoff, tstep) for k, i in
                  enumerate(np.arange(0, final_nsamp, pack_size))]
        outputs = pool.map(wrapper_vgm1d_med, argsin, chunksize=1)
        pool.close()
        pool.join()

        zip_out = list(zip(*outputs))

        vdata = []
        if len(outputs) > 0:
            for i in range(len(outputs[0])):
                vdata.append(np.concatenate(zip_out[i]))

        # old method, changing for median more robust to outliers
        # for k, i in enumerate(np.arange(0,final_nsamp,pack_size)):
        #     vdata[:, i:min(i+pack_size,final_nsamp)]=zip_out[0][k]
        #     pdata[:, i:min(i+pack_size,final_nsamp)]=zip_out[1][k]

    # ptot = np.nansum(pdata,axis=1)
    # mean variogram accounting for the number of pairwise comparison in each pixel
    # vmean = np.nansum(vdata * pdata,axis=1) / ptot
    # vmean = np.nanmedian(vdata,axis=1)
    # 'rough' std: between pixels, not accounting for the number of pairwise observation
    # vstd = np.nanstd(vdata,axis=1)

    vmean = np.zeros(len(lags)) * np.nan
    vstd = np.zeros(len(lags)) * np.nan
    if len(vdata) > 0:
        for i in range(len(lags)):
            vmean[i] = np.nanmedian(vdata[i])
            vstd[i] = nmad(vdata[i])

    return lags, vmean, vstd


def plot_vgm(lags, vmean, vstd):
    fig, ax = plt.subplots(1)
    ax.plot(lags, vmean, lw=2, label='mean', color='blue')
    ax.fill_between(lags, vmean + vstd, vmean - vstd, facecolor='blue', alpha=0.5)
    ax.set_title('Variogram: ')
    ax.set_xlabel('Lag [year]')
    ax.set_ylabel('Semivariance [$\mu$ $\pm \sigma$]')
    ax.legend(loc='lower left')
    ax.grid()
    # plt.savefig(fn_fig, dpi=600)
    plt.show()


def parse_epsg(wkt):
    return int(''.join(filter(lambda x: x.isdigit(), wkt.split(',')[-1])))


[docs]def ols_matrix(X, Y, conf_interv=0.68, conf_slope=0.68):
    """
    Ordinary Least-Squares (matrix, for optimized processing time)

    :param X: Data cube of x
    :param Y: Data cube of y
    :param conf_interv: Confidence interval for y output
    :param conf_slope: Confidence interval for WLS slope output

    :returns: Slope, Intercept, Slope CI, Y lower CB, Y upper CB
    """
    # perform independent OLS matricially for optimal processing time
    # inspired from: https://en.wikipedia.org/wiki/Simple_linear_regression#Normality_assumption
    # and https://fr.wikipedia.org/wiki/M%C3%A9thode_des_moindres_carr%C3%A9s
    x = X * 1.0
    y = Y * 1.0

    x[np.isnan(y)] = np.nan  # check for NaNs
    y[np.isnan(x)] = np.nan  # check for NaNs

    moy_X = np.nanmean(x, axis=0)
    moy_Y = np.nanmean(y, axis=0)

    mat_cross_product = (x - moy_X) * (y - moy_Y)
    sum_mat_cross_product = np.nansum(mat_cross_product, axis=0)

    mat_X_squared = (x - moy_X) ** 2
    sum_mat_X_squared = np.nansum(mat_X_squared, axis=0)

    beta1 = sum_mat_cross_product / sum_mat_X_squared
    beta0 = moy_Y - beta1 * moy_X

    # confidence interval
    alpha_interv = 1. - conf_interv
    alpha_slope = 1. - conf_slope

    Y_pred = beta1 * x + beta0
    n = np.sum(~np.isnan(x), axis=0)
    SSX = sum_mat_X_squared
    SXY = np.sqrt(np.nansum((y - Y_pred) ** 2, axis=0) / (n - 2))
    SE_slope = SXY / np.sqrt(SSX)
    hi = 1. / n + (x - moy_X) ** 2 / SSX

    # quantile of student's t distribution for p=1-alpha/2
    q_interv = stats.t.ppf(1. - alpha_interv / 2, n - 2)
    q_slope = stats.t.ppf(1. - alpha_slope / 2, n - 2)

    # upper and lower CI:
    dy = q_interv * SXY * np.sqrt(hi)
    Yl = Y_pred - dy
    Yu = Y_pred + dy

    # incert on slope
    incert_slope = q_slope * SE_slope

    return beta1, beta0, incert_slope, Yl, Yu


[docs]def wls_matrix(x, y, w, conf_interv=0.68, conf_slope=0.68):
    """
    Weighted Least-Squares (matrix, for optimized processing time)

    :param x: Data cube of x
    :param y: Data cube of y
    :param w: Data cube of weights
    :param conf_interv: Confidence interval for y output
    :param conf_slope: Confidence interval for WLS slope output

    :returns: Slope, Intercept, Slope CI, Y lower CB, Y upper CB
    """

    X = x * 1.0
    Y = y * 1.0
    W = w * 1.0

    Y[np.isnan(W) | np.isnan(X)] = np.nan  # check for NaNs
    X[np.isnan(W) | np.isnan(Y)] = np.nan  # check for NaNs
    W[np.isnan(Y) | np.isnan(X)] = np.nan  # check for NaNs

    sum_w = np.nansum(W, axis=0)
    moy_X_w = np.nansum(X * W, axis=0) / sum_w
    moy_Y_w = np.nansum(Y * W, axis=0) / sum_w

    mat_cross_product = W * (X - moy_X_w) * (Y - moy_Y_w)
    sum_mat_cross_product = np.nansum(mat_cross_product, axis=0)

    mat_X_squared = W * (X - moy_X_w) ** 2
    sum_mat_X_squared = np.nansum(mat_X_squared, axis=0)

    beta1 = sum_mat_cross_product / sum_mat_X_squared
    beta0 = moy_Y_w - beta1 * moy_X_w

    # confidence interval
    alpha_interv = 1. - conf_interv
    alpha_slope = 1. - conf_slope

    Y_pred = beta1 * X + beta0
    n = np.sum(~np.isnan(X), axis=0)
    SSX = sum_mat_X_squared
    SXY = np.sqrt(np.nansum(W * (Y - Y_pred) ** 2, axis=0) / (n - 2))
    SE_slope = SXY / np.sqrt(SSX)
    hi = 1. / n + W * (X - moy_X_w) ** 2 / SSX

    # quantile of student's t distribution for p=1-alpha/2
    q_interv = stats.t.ppf(1. - alpha_interv / 2, n - 2)
    q_slope = stats.t.ppf(1. - alpha_slope / 2, n - 2)

    # get the upper and lower CI:
    dy = q_interv * SXY * np.sqrt(hi)
    Yl = Y_pred - dy
    Yu = Y_pred + dy

    # calculate incert on slope
    incert_slope = q_slope * SE_slope

    return beta1, beta0, incert_slope, Yl, Yu


def interp_data(t, y, sig, interp_t):
    y_ = interp1d(t, y)
    s_ = interp1d(t, sig)
    return y_(interp_t), s_(interp_t)


def robust_wls(t_vals, data, ferr):
    n_out = 1
    while n_out > 0:
        beta1, beta0, incert_slope, _, _ = wls_matrix(t_vals, data, 1. / ferr,
                                                      conf_slope=0.99)
        trend = beta1 * t_vals + beta0
        std_nmad_rat = np.std(data - trend) / nmad(data - trend)
        if std_nmad_rat > 20:
            isin = np.abs(data - trend) < 4 * nmad(data - trend)
        else:
            isin = np.abs(data - trend) < 4 * np.std(data - trend)
        n_out = np.count_nonzero(~isin)

        data = data[isin]
        t_vals = t_vals[isin]
        ferr = ferr[isin]

    return beta1, incert_slope


def detrend(t_vals, data, ferr):
    n_out = 1
    while n_out > 0:
        reg = LinearRegression().fit(t_vals.reshape(-1, 1), data.reshape(-1, 1))

        trend = reg.predict(t_vals.reshape(-1, 1)).squeeze()
        std_nmad_rat = np.std(data - trend) / nmad(data - trend)
        if std_nmad_rat > 20:
            isin = np.abs(data - trend) < 4 * nmad(data - trend)
        else:
            isin = np.abs(data - trend) < 4 * np.std(data - trend)
        n_out = np.count_nonzero(~isin)

        data = data[isin]
        t_vals = t_vals[isin]
        ferr = ferr[isin]

    return reg


[docs]def gpr(data, t_vals, uncert, t_pred, opt=False, kernel=None, not_stable=True, detrend_ls=False, loop_detrend=False):
    """
    Gaussian Process regression wrapper

    :param data: Data cube of elevations
    :param t_vals: Time vector of data cube
    :param uncert: Data cube of errors
    :param t_pred: Time vector to predict
    :param opt: Boolean, run kernel optimization
    :param kernel: Provide kernel object
    :param not_stable: Mask of unstable terrain (to apply different kernels)
    :param detrend_ls: Boolean, remove linear trend before applying GP
    :param loop_detrend: Boolean, remove linear trend at each iteration before applying GP

    :returns: Data cube of fitted elevations, Data cube of propagated 1-sigma elevation errors, Data cube mask of valid data
    """

    # if only 0 or 1 elevation values in the pixel, no fitting
    if np.count_nonzero(np.isfinite(data)) < 2:
        return np.nan * np.zeros(t_pred.shape), np.nan * np.zeros(t_pred.shape), np.nan * np.zeros(data.shape)

    data_vals = data[np.isfinite(data)]
    err_vals = uncert[np.isfinite(data)] ** 2
    time_vals = t_vals[np.isfinite(data)]

    # by default, no optimizer: applying GPR with defined kernels
    if opt:
        optimizer = 'fmin_l_bfgs_b'
        n_restarts_optimizer = 9
    else:
        optimizer = None
        n_restarts_optimizer = 0

    # initializing
    n_out = 1
    final_fit = 0
    niter = 0
    tag_detr = 1

    num_finite = data_vals.size
    good_vals = np.isfinite(data_vals)
    max_z_score = [20, 12, 9, 6, 4]

    while (n_out > 0 or final_fit == 0) and num_finite >= 2:

        # default kernels
        if kernel is None:

            # weighted least squares
            beta1, beta0, incert_slope, _, _ = wls_matrix(time_vals[good_vals], data_vals[good_vals],
                                                          1. / err_vals[good_vals], conf_slope=0.99)

            # standardized dispersion from linearity (standardized RMSE)
            if ~np.isnan(beta1) and ~np.isnan(beta0):
                res_stdized = np.sqrt(np.mean(
                    (data_vals[good_vals] - (beta0 + beta1 * time_vals[good_vals])) ** 2 / err_vals[good_vals]))
                res = np.sqrt(np.mean((data_vals[good_vals] - (beta0 + beta1 * time_vals[good_vals])) ** 2))
            else:
                res_stdized = 1
                res = np.sqrt(50.)

            # split two periods to try to detect and not filter out surges
            ind_first = np.logical_and(good_vals, time_vals < 10.)
            ind_last = np.logical_and(good_vals, time_vals >= 10.)

            if final_fit == 0:
                nonlin_var = 0
                period_nonlinear = 20
                # first iteration, let it filter out very large outliers
                if niter == 0:
                    base_var = 100.
                    # adapt variance to try not to filter out surges before the final fit with local linear kernel
                elif np.count_nonzero(ind_first) >= 5 and np.count_nonzero(ind_last) >= 5:
                    diff = np.abs(np.mean(data_vals[ind_first]) - np.mean(data_vals[ind_last]))
                    diff_std = np.sqrt(np.std(data_vals[ind_first]) ** 2 + np.std(data_vals[ind_last]) ** 2)
                    if diff - diff_std > 0:
                        base_var = 50 + (diff - diff_std) ** 2 / 2
                    else:
                        base_var = 50.
                else:
                    base_var = 50.
            else:
                # final fit
                base_var = 50.
                if res_stdized != 0:
                    nonlin_var = np.mean(err_vals[good_vals]) + (res / res_stdized) ** 2
                    period_nonlinear = min(100., 100. / res_stdized ** 2)
                else:
                    nonlin_var = np.mean(err_vals[good_vals])
                    period_nonlinear = 100.

            # linear kernel + periodic kernel + local kernel
            k1 = PairwiseKernel(1, metric='linear')  # linear kernel
            k2 = C(30) * ESS(length_scale=1, periodicity=1)  # periodic kernel
            # k3 =  #local kernel
            # k3 = C(50) * RBF(1)
            k3 = C(base_var * 0.6) * RBF(0.75) + C(base_var * 0.3) * RBF(1.5) + C(base_var * 0.1) * RBF(3)
            k4 = PairwiseKernel(1, metric='linear') * C(nonlin_var) * RQ(period_nonlinear, 10)
            kern = k1 + k2
            if not_stable:
                # k3 =  #non-linear kernel
                kern += k3 + k4
        else:
            kern = kernel

        # here we need to change the 0 for the x axis, in case we are using a linear kernel
        mu_x = np.nanmean(time_vals[good_vals])
        detr_t_pred = t_pred - mu_x
        detr_time_vals = time_vals - mu_x
        mu_y = np.nanmean(data_vals)

        if detrend_ls:
            # first, remove a linear trend
            if tag_detr != 0:
                try:
                    # try to remove a linear fit from the data before we fit, then add it back in when we're done.
                    reg = detrend(detr_time_vals[good_vals], data_vals[good_vals], err_vals[good_vals])
                except:
                    return np.nan * np.zeros(t_pred.shape), np.nan * np.zeros(t_pred.shape), np.nan * np.zeros(
                        data.shape)

                l_trend = reg.predict(detr_time_vals.reshape(-1, 1)).squeeze()
                if not loop_detrend:
                    tag_detr = 0

            detr_data_vals = data_vals - l_trend
        else:
            # the mean has to be 0 to do gpr, even if we don't detrend
            detr_data_vals = data_vals - mu_y

        # if we remove a linear trend, normalize_y should be false...
        gp = GaussianProcessRegressor(kernel=kern, optimizer=optimizer, n_restarts_optimizer=n_restarts_optimizer,
                                      alpha=err_vals[good_vals], normalize_y=False)
        gp.fit(detr_time_vals[good_vals].reshape(-1, 1), detr_data_vals[good_vals].reshape(-1, 1))
        y_pred, sigma = gp.predict(detr_t_pred.reshape(-1, 1), return_std=True)
        y_, s_ = interp_data(detr_t_pred, y_pred.squeeze(), sigma.squeeze(), detr_time_vals)
        z_score = np.abs(detr_data_vals - y_) / s_

        isin = z_score[np.isfinite(z_score)] < 4
        # we continue the loop if there is a least one value outside 4 stds
        n_out = np.count_nonzero(~isin)

        # good elevation values can also be outside 4stds because of bias in the first fits
        # thus, if needed, we remove outliers packet by packet, starting with the largest ones
        isout = z_score[np.isfinite(z_score)] > max_z_score[min(niter, len(max_z_score) - 1)]
        tmp_data_vals = data_vals[np.isfinite(z_score)]
        tmp_data_vals[isout] = np.nan
        data_vals[np.isfinite(z_score)] = tmp_data_vals

        good_vals = np.isfinite(data_vals)
        num_finite = np.count_nonzero(good_vals)
        niter += 1

        # no need to filter outliers for final fit
        if final_fit == 1:
            n_out = 0
        # if we have no outliers outside 4 std, initialize back values to jump directly to the final fitting step
        if n_out == 0 and final_fit == 0 and niter > 1:
            n_out = 1
            final_fit = 1
            niter = len(max_z_score) - 1

    # if there is not enough data left...
    if num_finite < 2:
        y_pred = np.nan * np.zeros(t_pred.shape)
        sigma = np.nan * np.zeros(t_pred.shape)

    if detrend_ls:
        l_pred = reg.predict(t_pred.reshape(-1, 1)).squeeze()
        y_out = y_pred.squeeze() + l_pred
    else:
        y_out = y_pred.squeeze() + mu_y

    filt_data = data
    filt_data[np.isfinite(data)] = data_vals

    return y_out, sigma.squeeze(), np.isfinite(filt_data)


[docs]def ls(subarr, t_vals, err, weigh, filt_ls=False, conf_filt=0.99):
    """
    Ordinary or Weighted Least-Squares wrapper

    :param subarr: Data cube of elevations
    :param t_vals: Time vector of data cube
    :param err: Data cube of errors
    :param weigh: Boolean, use weighted least-squares
    :param filt_ls: Boolean, use a first CI filtering before fitting again
    :param conf_filt: Confidence interval of the filtering fit

    :returns: Raster concatenation (LS slope, LS intercept, slope 1-sigma error, number of samples, date of first sample, date of last sample), Data cube mask of valid data
    """

    T, Y, X = subarr.shape

    # converting time values
    y0 = t_vals[0].astype('datetime64[D]').astype(object).year
    y1 = t_vals[-1].astype('datetime64[D]').astype(object).year + 1.1
    total_delta = np.datetime64('{}-01-01'.format(int(y1))) - np.datetime64('{}-01-01'.format(int(y0)))
    ftime_delta = np.array([t - np.datetime64('{}-01-01'.format(int(y0))) for t in t_vals])
    time_vals = y0 + (ftime_delta / total_delta) * (int(y1) - y0)

    z_mat = subarr.reshape(T, Y * X)
    t_mat = np.array([time_vals, ] * Y * X).T
    w_mat = err.reshape(T, Y * X)
    # old error
    # if weigh:
    #     w_mat = np.array([1. / err ** 2, ] * Y * X).T

    if filt_ls:
        if weigh:
            yl, yu = wls_matrix(t_mat, z_mat, w_mat, conf_interv=conf_filt)[3:5]
        else:
            yl, yu = ols_matrix(t_mat, z_mat, conf_interv=conf_filt)[3:5]

        z_mat[z_mat < yl] = np.nan
        z_mat[z_mat > yu] = np.nan

    if weigh:
        beta1, beta0, incert_slope = wls_matrix(t_mat, z_mat, w_mat)[0:3]
    else:
        beta1, beta0, incert_slope = ols_matrix(t_mat, z_mat)[0:3]

    date_min = np.nanmin(t_mat, axis=0)
    date_max = np.nanmax(t_mat, axis=0)
    nb_trend = (~np.isnan(z_mat)).sum(axis=0)

    filter_less_2_DEMs = nb_trend <= 2
    beta1[filter_less_2_DEMs] = np.nan
    incert_slope[filter_less_2_DEMs] = np.nan

    slope = np.reshape(beta1, (Y, X))
    interc = np.reshape(beta0, (Y, X))
    slope_sig = np.reshape(incert_slope, (Y, X))
    nb_dem = np.reshape(nb_trend, (Y, X))
    date_min = np.reshape(date_min, (Y, X))
    date_max = np.reshape(date_max, (Y, X))

    filt_subarr = np.isfinite(z_mat.reshape(T, Y, X))

    outarr = np.stack((slope, interc, slope_sig, nb_dem, date_min, date_max), axis=0)

    return outarr, filt_subarr


# @jit
def gpr_wrapper(argsin):
    z, i, t_vals, err, new_t, opt, kernel, uns_arr = argsin
    start = time.time()

    Y, X = z.shape[1:3]
    outarr = np.nan * np.zeros((new_t.size * 2 + z.shape[0], Y, X))
    # pixel by pixel
    for x in range(X):
        for y in range(Y):
            if uns_arr is not None:
                uns_tag = uns_arr[0, y, x]
            else:
                uns_tag = True
            uncert = err[:, y, x]
            tmp_y, tmp_sig, tmp_filt = gpr(z[:, y, x], t_vals, uncert, new_t, opt=opt, not_stable=uns_tag,
                                           kernel=kernel)[0:3]
            out = np.concatenate((tmp_y, tmp_sig, tmp_filt), axis=0)
            outarr[:, y, x] = out
    elaps = time.time() - start

    print('Done with block {}, elapsed time {}.'.format(i, elaps))
    return outarr


def gpr_dask_wrapper(z, err, t_vals, new_t, opt=False, kernel=None, uns_arr=None):
    start = time.time()

    Y, X = z.shape[0:2]
    outarr = np.nan * np.zeros((Y, X, new_t.size * 2 + z.shape[2],))
    # pixel by pixel
    for x in range(X):
        for y in range(Y):
            if uns_arr is not None:
                uns_tag = uns_arr[y, x, 0]
            else:
                uns_tag = True
            uncert = err[y, x, :]
            tmp_y, tmp_sig, tmp_filt = gpr(z[y, x, :], t_vals, uncert, new_t, opt=opt, not_stable=uns_tag,
                                           kernel=kernel)[0:3]
            out = np.concatenate((tmp_y, tmp_sig, tmp_filt), axis=0)
            outarr[y, x, :] = out
    elaps = time.time() - start

    print('Done with block, elapsed time {}.'.format(elaps))
    return outarr


def gpr_dask(z, err, time_vals, new_t):
    part_fun = functools.partial(gpr_dask_wrapper, t_vals=time_vals, new_t=new_t)

    return xr.apply_ufunc(part_fun, z, err, input_core_dims=[['time'], ['time']], output_core_dims=[['new_time']],
                          output_sizes={'new_time': 2 * len(new_t) + len(time_vals)},
                          dask='parallelized', output_dtypes=[float], join='outer')


@jit
def ls_wrapper(argsin):
    subarr, i, t_vals, err, weigh, filt_ls, conf_filt = argsin
    start = time.time()
    # matrix
    outarr, filt_subarr = ls(subarr, t_vals, err, weigh, filt_ls=filt_ls, conf_filt=conf_filt)
    elaps = time.time() - start
    print('Done with block {}, elapsed time {}.'.format(i, elaps))
    return outarr, filt_subarr


def splitter(img, nblocks):
    # manually split a datacube for parallel computing

    split1 = np.array_split(img, nblocks[0], axis=1)
    split2 = [np.array_split(im, nblocks[1], axis=2) for im in split1]
    olist = [np.copy(a) for a in list(chain.from_iterable(split2))]
    return olist


def stitcher(outputs, nblocks):
    # manually stitch a datacube for parallel computing

    stitched = []
    if np.array(nblocks).size == 1:
        nblocks = np.array([nblocks, nblocks])
    for i in range(nblocks[0]):
        stitched.append(outputs[i * nblocks[1]])
        for j in range(1, nblocks[1]):
            outind = j + i * nblocks[1]
            stitched[i] = np.concatenate((stitched[i], outputs[outind]), axis=2)
    return np.concatenate(tuple(stitched), axis=1)


def patchify(arr, nblocks, overlap):
    # manually split with overlap a datacube for parallel computing

    overlap = int(np.floor(overlap))
    patches = []
    nx, ny = np.shape(arr)
    nx_sub = nx // nblocks[0]
    ny_sub = ny // nblocks[1]
    split = [[nx_sub * i, min(nx_sub * (i + 1), nx), ny_sub * j, min(ny_sub * (j + 1), ny)] for i in
             range(nblocks[0] + 1) for j in range(nblocks[1] + 1)]
    over = [[max(0, l[0] - overlap), min(nx, l[1] + overlap), max(0, l[2] - overlap), min(l[3] + overlap, ny)] for l in
            split]
    inv = []
    for k in range(len(split)):
        x0, x1, y0, y1 = split[k]
        i0, i1, j0, j1 = over[k]
        patches.append(arr[i0:i1, j0:j1])
        inv.append([x0 - i0, x1 - i0, y0 - j0, y1 - j0])

    return patches, inv, split


def unpatchify(arr_shape, subarr, inv, split):
    # manually stitch account for split overlap a datacube for parallel computing

    out = np.zeros(arr_shape)

    for k, arr in enumerate(subarr):
        s = split[k]
        i = inv[k]
        out[s[0]:s[1], s[2]:s[3]] = arr[i[0]:i[1], i[2]:i[3]]

    return out


[docs]def cube_to_stack(ds, out_cube, y0, nice_fit_t, outfile, slope_arr=None, ci=True, clobber=False, filt_bool=False):
    """
    Write data cube to netcdf with new time axis, confidence interval, slope, etc...
    #TODO: this could be simplified quite a bit...

    :param ds: xarray Dataset of data cube
    :param out_cube: Data cube of time series of elevations
    :param y0: Origin of time vector
    :param nice_fit_t: Time vector relative to the origin
    :param outfile: Filename of output netcdf
    :param slope_arr: Slope raster to write as concomitant DataArray
    :param ci: Boolean to look for confidence interval in the out_cube
    :param clobber: Boolean to replace output file
    :param filt_bool: Boolean to force a boolean datacube as output (e.g., to the mask data cube of the multi-step filtering)

    :return:
    """

    fit_cube = out_cube[:len(nice_fit_t), :, :]

    img_shape = np.zeros(np.shape(fit_cube)[1:3])

    nco, to, xo, yo = st.create_nc(img_shape, outfile=outfile,
                                   clobber=clobber, t0=np.datetime64('{}-01-01'.format(y0)))
    st.create_crs_variable(parse_epsg(ds['crs'].spatial_ref), nco)

    x, y = ds['x'].values, ds['y'].values
    xo[:] = x
    yo[:] = y
    to[:] = nice_fit_t

    if not filt_bool:
        dt = 'f4'
        fill = -9999
    else:
        dt = 'i1'
        fill = False
        fit_cube = np.array(fit_cube, dtype=bool)

    zo = nco.createVariable('z', dt, ('time', 'y', 'x'), fill_value=fill, zlib=True,
                            chunksizes=[500, min(150, ds.y.size), min(150, ds.x.size)])
    zo.units = 'meters'
    zo.long_name = 'Fit elevation above WGS84 ellipsoid'
    zo.grid_mapping = 'crs'
    zo.coordinates = 'x y'
    zo.set_auto_mask(True)

    zo[:] = fit_cube

    if ci:
        sig_cube = out_cube[len(nice_fit_t):, :, :]
        fzo = nco.createVariable('z_ci', 'f4', ('time', 'y', 'x'), fill_value=-9999, zlib=True,
                                 chunksizes=[500, min(150, ds.y.size), min(150, ds.x.size)])
        fzo.units = 'meters'
        fzo.long_name = '68% confidence interval for elevation fit.'
        fzo.grid_mapping = 'crs'
        fzo.coordinates = 'x y'
        fzo.set_auto_mask(True)

        fzo[:] = sig_cube

    if slope_arr is not None:
        so = nco.createVariable('slope', 'f4', ('y', 'x'), fill_value=-9999, zlib=True,
                                chunksizes=[min(150, ds.y.size), min(150, ds.x.size)])
        so.units = 'degrees'
        so.long_name = 'median slope used to condition elevation uncertainties'
        so.grid_mapping = 'crs'
        so.coordinates = 'x y'

        so[:] = slope_arr

    nco.close()


def arr_to_img(ds, out_arr, outfile):
    outfile_slope = os.path.join(os.path.dirname(outfile),
                                 os.path.splitext(os.path.basename(outfile))[0] + '_dh.tif')
    outfile_interc = os.path.join(os.path.dirname(outfile),
                                  os.path.splitext(os.path.basename(outfile))[0] + '_interc.tif')
    outfile_sig = os.path.join(os.path.dirname(outfile),
                               os.path.splitext(os.path.basename(outfile))[0] + '_err.tif')
    outfile_nb = os.path.join(os.path.dirname(outfile),
                              os.path.splitext(os.path.basename(outfile))[0] + '_nb.tif')
    outfile_dmin = os.path.join(os.path.dirname(outfile),
                                os.path.splitext(os.path.basename(outfile))[0] + '_dmin.tif')
    outfile_dmax = os.path.join(os.path.dirname(outfile),
                                os.path.splitext(os.path.basename(outfile))[0] + '_dmax.tif')

    arr = st.make_geoimg(ds, band=0)
    arr.img = out_arr[0, :, :]
    arr.write(outfile_slope)
    arr.img = out_arr[1, :, :]
    arr.write(outfile_interc)
    arr.img = out_arr[2, :, :]
    arr.write(outfile_sig)
    arr.img = out_arr[3, :, :]
    arr.write(outfile_nb)
    arr.img = out_arr[4, :, :]
    arr.write(outfile_dmin)
    arr.img = out_arr[5, :, :]
    arr.write(outfile_dmax)


# TODO: homogenize how REF DEM is provided: sometimes filename, sometimes raster, sometimes GeoImg... inadmissible :)

[docs]def time_filter_ref(ds_arr, ref_arr, t_vals, ref_date, max_dhdt=[-50, 50], base_thresh=100.):
    """
    Temporal filtering of stacked DEMs with a reference DEM: removes all elevations outside a positive/negative linear
     elevation trend from the reference, with a base threshold.

    :param ds_arr: Data cube of elevations
    :param ref_arr: Raster of reference DEM
    :param t_vals: Time vector of  data cube
    :param ref_date: Date of reference DEM
    :param max_dhdt: Maximum positive/negative elevation change rate per year from reference elevations [-50 m,50 m]
    :param base_thresh: Base vertical threshold for the temporal filter (avoid elevations close to reference) [100 m]

    :return: ds_arr: Filtered data cube
    """

    print('Adding base threshold of ' + str(base_thresh) + ' m around reference values.')

    delta_t = (ref_date - t_vals).astype('timedelta64[D]').astype(float) / 365.24
    dh = ref_arr[None, :, :] - ds_arr
    dt_arr = np.ones(dh.shape) * delta_t[:, None, None]
    # dh = ref_arr[:,:,None] - z_arr
    # dt_arr = np.ones(dh.shape) * delta_t[None,None,:]
    if np.array(max_dhdt).size == 1:
        ds_arr[np.abs(dh) > base_thresh + np.abs(dt_arr) * max_dhdt] = np.nan
    else:
        ds_arr[np.logical_or(np.logical_and(dt_arr < 0, np.logical_or(dh < - base_thresh + dt_arr * max_dhdt[1],
                                                                      dh > base_thresh + dt_arr * max_dhdt[0])),
                             np.logical_and(dt_arr > 0, np.logical_or(dh > base_thresh + dt_arr * max_dhdt[1],
                                                                      dh < - base_thresh + dt_arr * max_dhdt[
                                                                          0])))] = np.nan
    return ds_arr


[docs]def dask_time_filter_ref(ds, ds_arr, ref_dem, t_vals, ref_date, max_dhdt=[-50, 50], base_thresh=100., nproc=1):
    """
       Dask wrapper of temporal filtering of stacked DEMs with a reference DEM: removes all elevations outside a
       positive/negative linear elevation trend from the reference, with a base threshold.

       :param ds_arr: Data cube of elevations
       :param ds_arr: Data cube of elevations
       :param ref_dem: GeoImg of reference DEM
       :param t_vals: Time vector of  data cube
       :param ref_date: Date of reference DEM
       :param max_dhdt: Maximum positive/negative elevation change rate per year from reference elevations [-50 m,50 m]
       :param base_thresh: Base vertical threshold for the temporal filter (avoid elevations close to reference) [100 m]
       :param nproc: Number of cores for multiprocessing [1]

       :return: Filtered data cube
       """

    start = time.time()
    print('Setting up time filtering parallelized...')
    part_tf = functools.partial(time_filter_ref, t_vals=t_vals, ref_date=ref_date, max_dhdt=max_dhdt,
                                base_thresh=base_thresh)

    z_dask = xr.DataArray(ds_arr, coords=[ds.time.values, ds.y.values, ds.x.values], dims=['time', 'y', 'x'])
    ref_arr = ref_dem.img
    ref_dask = xr.DataArray(ref_arr, coords=[ds.y.values, ds.x.values], dims=['y', 'x'])

    print('Time filter elapsed time is ' + str(time.time() - start))
    print('Chunking...')
    z_chunk = z_dask.chunk({'x': 200, 'y': 200})
    ref_chunk = ref_dask.chunk({'x': 200, 'y': 200})

    print('Time filter elapsed time is ' + str(time.time() - start))
    print('Chunking done, computing.')

    sl = xr.apply_ufunc(part_tf, z_chunk, ref_chunk, input_core_dims=[['time'], []], output_core_dims=[['time']],
                        dask='parallelized', output_dtypes=[float])
    filt_z_dask = sl.compute(num_workers=nproc, scheduler='processes')
    filt_z_dask = filt_z_dask.transpose('time', 'y', 'x')

    print('Time filter elapsed time is ' + str(time.time() - start))

    return filt_z_dask.values


@jit_filter_function
def nanmax(a):
    return np.nanmax(a)


@jit_filter_function
def nanmin(a):
    return np.nanmin(a)


def wrapper_slope(argsin):
    arr, in_met = argsin

    start = time.time()
    slope_list = []

    # create input image
    gt, proj, npix_x, npix_y = in_met
    drv = gdal.GetDriverByName('MEM')
    dst = drv.Create('', npix_x, npix_y, 1, gdal.GDT_Float32)
    sp = dst.SetProjection(proj)
    sg = dst.SetGeoTransform(gt)
    for i in range(np.shape(arr)[0]):
        out_arr = np.copy(arr[i, :])
        out_arr[np.isnan(out_arr)] = -9999
        wa = dst.GetRasterBand(1).WriteArray(out_arr)
        md = dst.SetMetadata({'Area_or_point': 'Point'})
        nd = dst.GetRasterBand(1).SetNoDataValue(-9999)
        tmp_z = GeoImg(dst)

        slope = get_slope(tmp_z)

        slope_list.append(slope.img)
    del sp, sg, wa, md, nd

    slope_stack = np.stack(slope_list, axis=0)

    print('Deriving slope stack in ' + str(time.time() - start))

    return slope_stack


def maxmin_disk_filter(argsin):
    arr, rad = argsin

    max_arr = filters.generic_filter(arr, nanmax, footprint=disk(rad))
    min_arr = filters.generic_filter(arr, nanmin, footprint=disk(rad))

    return max_arr, min_arr


@jit
def robust_nanmax(a):
    return np.nanpercentile(a, 80)
    # return np.nanmax(a[np.abs(np.nanmedian(a)-a)<3*nmad(a)])


@jit
def robust_nanmin(a):
    # return np.nanmin(a[np.abs(np.nanmedian(a)-a)<3*nmad(a)])
    return np.nanpercentile(a, 20)


def robust_maxmin_disk_filter(argsin):
    arr, rad = argsin

    max_arr = filters.generic_filter(arr, robust_nanmax, footprint=disk(rad))
    min_arr = filters.generic_filter(arr, robust_nanmin, footprint=disk(rad))

    return max_arr, min_arr


[docs]def spat_filter_ref(ds_arr, ref_dem, cutoff_kern_size=500, cutoff_thr=20., nproc=1):
    """
    Spatial filtering of stacked DEMs with a reference DEM: removes all elevations smaller than the minimum or larger
    than the maximum reference elevation found within a circle of certain size, with a base threshold.

    :param ds_arr: Data cube of elevations
    :param ref_dem: GeoImg of reference DEM
    :param cutoff_kern_size: Radius size for kernel spatial filtering [500 m]
    :param cutoff_thr: Base vertical threshold for the spatial filter (avoid elevations close to reference)

    :return: ds_arr: Filtered data cube
    """

    # here we assume that the reference DEM is a "clean" post-processed DEM, filtered with QA for low confidence outliers
    # minimum/maximum elevation in circular surroundings based on reference DEM

    ref_arr = ref_dem.img
    res = 100.
    rad = int(np.floor(cutoff_kern_size / res))

    if nproc == 1:
        print('Filtering min/max in radius of ' + str(cutoff_kern_size) + 'm, base threshold of ' + str(
            cutoff_thr) + 'm on 1 proc...')
        max_arr, min_arr = maxmin_disk_filter((ref_arr, rad))
    else:
        print('Filtering min/max in radius of ' + str(cutoff_kern_size) + 'm, base threshold of ' + str(
            cutoff_thr) + 'm on ' + str(nproc) + ' procs...')
        nopt = int(np.floor(np.sqrt(nproc)))
        nblocks = [nopt, nopt]
        patches, inv, split = patchify(ref_arr, nblocks, rad)

        pool = mp.Pool(nproc, maxtasksperchild=1)
        argsin = [(p, rad) for i, p in enumerate(patches)]
        outputs = pool.map(maxmin_disk_filter, argsin, chunksize=1)
        pool.close()
        pool.join()

        zip_out = list(zip(*outputs))

        max_arr = unpatchify(np.shape(ref_arr), zip_out[0], inv, split)
        min_arr = unpatchify(np.shape(ref_arr), zip_out[1], inv, split)

    for i in range(ds_arr.shape[0]):
        ds_arr[i, np.logical_or(ds_arr[i, :] > (max_arr + cutoff_thr), ds_arr[i, :] < (min_arr - cutoff_thr))] = np.nan

    return ds_arr


def nanmedian_slope(slope_cube):
    return np.nanmedian(slope_cube, axis=2)


[docs]def isel_merge_dupl_dates(ds):
    """
    Merge duplicate dates of a xarray dataset by taking the mean of similar dates (e.g., overlapping same-date ASTER DEMs)

    :param ds: xarray Dataset of datacube
    :return: ds: merged Dataset
    """

    # merge DEMs elevations (np.nanmean) for similar dates
    t_vals = ds.time.values
    dates_rm_dupli = sorted(list(set(list(t_vals))))
    ind_firstdate = []
    for i, date in enumerate(dates_rm_dupli):
        ind_firstdate.append(list(t_vals).index(date))
    ds_filt = ds.isel(time=np.array(ind_firstdate))
    for i in range(len(dates_rm_dupli)):
        t_ind = (t_vals == dates_rm_dupli[i])
        if len(t_ind) > 1:
            ds_filt.z.values[i, :] = np.nanmean(ds.z.values[t_ind, :], axis=0)
            ds_filt.uncert.values[i, :] = np.nanmean(ds.uncert.values[t_ind, :])

            # something is wrong when doing weighted mean...

            # careful, np.nansum gives back zero for an axis full of NaNs
            # mask_nan = np.all(np.isnan(ds.z.values[t_ind,:]),axis=0)
            # ds_filt.z.values[i, :] = np.nansum(ds.z.values[t_ind, :] * 1./ds.uncert.values[t_ind,None,None]**2,
            # axis=0)/ np.nansum(1./ds.uncert.values[t_ind,None,None]**2, axis=0)
            # ds_filt.z.values[i, mask_nan] = np.nan
            # ds_filt.uncert.values[i] = np.nansum(ds.uncert.values[t_ind] * 1./ds.uncert.values[t_ind]**2) /
            # np.nansum(1./ds.uncert.values[t_ind]**2)

    ds = ds_filt

    return ds


[docs]def isel_maskout(ds, inc_mask):
    """
    Mask out values out of shapefile and minimize spatial extent, remove void time steps (to decrease computing time)

    :param ds: xarray Dataset of datacube
    :return: ds: reduced Dataset
    """

    # simplify extent to mask, mask out remaining masked pixels in the extent as NaNs, remove unusued time indexes
    land_mask = get_stack_mask(inc_mask, ds)
    if np.count_nonzero(land_mask) > 0:
        ds, submask, slices = tt.sel_dc(ds, None, land_mask)
        print('Including only ' + str(np.count_nonzero(land_mask)) + ' pixels out of ' + str(
            np.shape(land_mask)[0] * np.shape(land_mask)[1]) + ' on this tile')
        ds_arr = ds.z.values
        ds_arr[:, ~submask] = np.nan
        print('Consequently removing void DEMs...')
        non_void = np.count_nonzero(np.isfinite(ds_arr), axis=(1, 2)) > 0
        ds = ds.isel(time=non_void)
    else:
        ds = None

    return ds


[docs]def constrain_var_slope_corr(ds, ds_arr, ds_corr, t_vals, uncert, fn_ref_dem=None, nproc=1):
    """
    Given a data cube of stacked DEMs, a data cube of stereo-correlations and a per-DEM vector of co-registration
    RMSE: estimate a median slope and compute a data cube of elevation errors.
    (error dependance on the slope and stereo-correlation is defined independently)
    #TODO: allow for a user-defined function of error?

    :param ds: xarray Dataset of datacube
    :param ds_arr: Data cube of elevations
    :param ds_corr: Data cube of stereo-correlations
    :param t_vals: Time vector of data cube
    :param uncert: Vector of co-registration RMSEs of the stacked DEMs
    :param fn_ref_dem: Filename for input reference DEM
    :param nproc: Number of cores for multiprocessing [1]

    :return: Data cube of errors, Raster of median slope values
    """

    print('>>Starting variance assessment...')

    start_var = time.time()

    if nproc == 1:
        print('Estimating terrain slope to constrain uncertainties with 1 proc...')
        slope_arr = np.zeros(np.shape(ds_arr))
        for i in range(len(t_vals)):
            slope = get_slope(st.make_geoimg(ds, i))
            slope.img[slope.img > 70] = np.nan
            slope_arr[i, :, :] = slope.img
    else:
        print('Estimating terrain slope to constrain uncertainties with ' + str(nproc) + ' procs...')
        tmp_img = st.make_geoimg(ds)
        pack_size = int(np.ceil(ds.time.size / nproc))
        in_met = (tmp_img.gt, tmp_img.proj_wkt, tmp_img.npix_x, tmp_img.npix_y)
        argsin_z = [(ds_arr[i:min(i + pack_size, ds.time.size), :], in_met) for k, i in
                    enumerate(np.arange(0, ds.time.size, pack_size))]
        pool = mp.Pool(nproc, maxtasksperchild=1)
        outputs_z = pool.map(wrapper_slope, argsin_z)
        pool.close()
        pool.join()

        print('Elapsed time during variance assess. is ' + str(time.time() - start_var))
        slope_arr = np.zeros(np.shape(ds_arr))
        for k, i in enumerate(np.arange(0, ds.time.size, pack_size)):
            slope_arr[i:min(i + pack_size, ds.time.size), :] = outputs_z[k]

    # err_dask = xr.DataArray(err_arr, coords=[ds.time.values, ds.y.values, ds.x.values], dims=['time', 'y', 'x'])
    #
    # err_dask = err_dask.chunk({'x': 100, 'y': 100})
    # sl = xr.apply_ufunc(nanmedian_slope, err_dask, input_core_dims=[['time']], dask='parallelized',
    #                     output_dtypes=[float])
    # med_dask = sl.compute(num_workers=nproc, scheduler='processes')
    # med_slope = med_dask.values
    med_slope = np.nanmedian(slope_arr, axis=0)

    if fn_ref_dem is not None:
        tmp_dem = GeoImg(fn_ref_dem)
        slope_all = get_slope(tmp_dem)
        slope_ref = slope_all.reproject(st.make_geoimg(ds))
        med_slope[np.isnan(med_slope)] = slope_ref.img[np.isnan(med_slope)]

    err_arr = np.ones(np.shape(ds_arr), dtype=np.float32)
    err_arr = err_arr * uncert[:, None, None] ** 2

    # based on prior analysis of variance, this is a pretty good generic fit
    slope_err = ((20 + 20 * (100 - ds_corr)) * np.tan(med_slope * np.pi / 180)) ** 2
    med_slope_arr = med_slope[None, :, :] * np.ones(np.shape(ds_arr)[0])[:, None, None]
    slope_err[med_slope_arr > 50] += ((med_slope_arr[med_slope_arr > 50] - 50) * 5) ** 2
    corr_err = (((100 - ds_corr) / 100) * 20) ** 2.5
    err_arr += slope_err
    err_arr += corr_err
    err_arr = np.sqrt(err_arr)
    err_arr[np.logical_or(~np.isfinite(err_arr), np.abs(err_arr) > 300)] = 300

    print('Elapsed time during variance assess. is ' + str(time.time() - start_var))

    return err_arr, med_slope


[docs]def prefilter_stack(ds, ds_arr, fn_ref_dem, t_vals, filt_ref='min_max', ref_dem_date=None, max_dhdt=[-50, 50], nproc=1):
    """
    Given a data cube of stacked DEMs and a reference DEM: condition a first-order spatial and temporal
    filtering of outliers.

    :param ds: xarray Dataset of datacube
    :param ds_arr: Data cube of elevations
    :param t_vals: Time vector of data cube
    :param filt_ref: Method of filtering: spatial "min_max", temporal "time" or "both" ["min_max"]
    :param fn_ref_dem: Filename for input reference DEM
    :param ref_dem_date: Date of reference DEM
    :param max_dhdt: Maximum positive/negative elevation change rate per year from reference elevations [-50 m,50 m]
    :param nproc: Number of cores for multiprocessing [1]

    :return: Filtered data cube
    """

    print('>>Starting prefiltering...')
    start_prefilt = time.time()

    print('Number of valid pixels:' + str(np.count_nonzero(~np.isnan(ds_arr))))

    # minimum/maximum elevation on Earth
    ds_arr[np.logical_or(ds_arr < -400, ds_arr > 8900)] = np.nan

    tmp_geo = st.make_geoimg(ds)
    tmp_dem = GeoImg(fn_ref_dem)
    ref_dem = tmp_dem.reproject(tmp_geo)
    if filt_ref == 'min_max':
        print('Filtering spatially using min/max values in {}'.format(fn_ref_dem))
        ds_arr = spat_filter_ref(ds_arr, ref_dem, nproc=nproc)
    elif filt_ref == 'time':
        if ref_dem_date is None:
            print('Reference DEM time stamp not specified, defaulting to 01.01.2000')
            ref_dem_date = np.datetime64('2000-01-01')
        print('Filtering temporally with threshold of {} m/a'.format(max_dhdt))
        # ds_arr =  dask_time_filter_ref(ds, ds_arr, ref_dem, t_vals, ref_dem_date, dhdt_thresh=time_filt_thresh, nproc=nproc)
        ds_arr = time_filter_ref(ds_arr, ref_dem, t_vals, ref_dem_date, max_dhdt=max_dhdt)
    elif filt_ref == 'both':
        print('Filtering spatially using min/max values in {}'.format(fn_ref_dem))
        ds_arr = spat_filter_ref(ds_arr, ref_dem, cutoff_kern_size=200, cutoff_thr=700., nproc=nproc)
        ds_arr = spat_filter_ref(ds_arr, ref_dem, cutoff_kern_size=500, cutoff_thr=500., nproc=nproc)
        ds_arr = spat_filter_ref(ds_arr, ref_dem, cutoff_kern_size=2000, cutoff_thr=300.,
                                 nproc=int(np.floor(nproc / 4)))
        print('Number of valid pixels after spatial filtering:' + str(np.count_nonzero(~np.isnan(ds_arr))))

        print('Elapsed time during prefiltering is ' + str(time.time() - start_prefilt))
        if ref_dem_date is None:
            print('Reference DEM time stamp not specified, defaulting to 01.01.2000')
            ref_dem_date = np.datetime64('2000-01-01')
        print('Filtering temporally with threshold of {} m/a'.format(max_dhdt))
        # ds_arr =  dask_time_filter_ref(ds, ds_arr, ref_dem, t_vals, ref_dem_date, dhdt_thresh=time_filt_thresh, nproc=nproc)
        ds_arr = time_filter_ref(ds_arr, ref_dem.img, t_vals, ref_dem_date, max_dhdt=max_dhdt)
        print('Number of valid pixels after temporal filtering:' + str(np.count_nonzero(~np.isnan(ds_arr))))

        print('Elapsed time during prefiltering is ' + str(time.time() - start_prefilt))

    return ds_arr


[docs]def robust_wls_ref_filter_stack(ds, ds_arr, err_arr, t_vals, fn_ref_dem, ref_dem_date=np.datetime64('2013-01-01'),
                                max_dhdt=[-50, 50], nproc=1, cutoff_kern_size=1000, max_deltat_ref=2.,
                                base_thresh=100.):
    """
    Given a data cube of stacked DEMs, of elevation errors and a reference DEM: condition a spatial and temporal
    filtering of outliers based on WLS linear trends.

    :param ds: xarray Dataset of data cube
    :param ds_arr: Data cube of elevations
    :param err_arr: Data cube of elevation errors
    :param t_vals: Time vector of data cube
    :param fn_ref_dem: Filename for input reference DEM
    :param ref_dem_date: Date of reference DEM
    :param max_dhdt: Maximum positive/negative elevation change rate per year from reference elevations [-50 m,50 m]
    :param nproc: Number of cores for multiprocessing [1]
    :param cutoff_kern_size: Radius size for kernel spatial filtering [1000 m]
    :param max_deltat_ref: Maximum time lag between acquisition of reference DEM and date provided [2 yr]
    :param base_thresh: Base vertical threshold for the temporal filter (avoid elevations close to reference) [100 m]

    :return: Filtered data cube
    """

    # TODO: remove ds from the list of inputs, and provide ref_dem_arr instead of fn_ref_dem?

    print('Performing WLS to condition filtering...')

    # wls parameters
    weig = True
    filt_ls = True
    conf_filt_ls = 0.99

    # getting radius size
    tmp_geo = st.make_geoimg(ds)
    tmp_dem = GeoImg(fn_ref_dem)
    ref_dem = tmp_dem.reproject(tmp_geo)
    ref_arr = ref_dem.img
    # TODO: do not hardcore resolution
    res = 100.
    rad = int(np.floor(cutoff_kern_size / res))

    # wls
    if nproc == 1:
        print('Processing with 1 core...')
        out_arr, _ = ls_wrapper((ds_arr, 0, t_vals, err_arr, weig, filt_ls, conf_filt_ls))
    else:
        print('Processing with ' + str(nproc) + ' cores...')
        # here calculation is done matricially so we want to use all cores with the largest tiles possible
        opt_n_tiles = int(np.floor(np.sqrt(nproc)))
        n_x_tiles = opt_n_tiles
        n_y_tiles = opt_n_tiles

        pool = mp.Pool(nproc, maxtasksperchild=1)
        split_arr = splitter(ds_arr, (n_y_tiles, n_x_tiles))
        split_err = splitter(err_arr, (n_y_tiles, n_x_tiles))

        argsin = [(s, i, np.copy(t_vals), np.copy(split_err[i]), weig, filt_ls, conf_filt_ls) for i, s in
                  enumerate(split_arr)]
        outputs = pool.map(ls_wrapper, argsin, chunksize=1)
        pool.close()
        pool.join()

        zip_out = list(zip(*outputs))
        out_arr = stitcher(zip_out[0], (n_y_tiles, n_x_tiles))[0, :, :]

    # removing large dhdt outliers
    # print('Writing to rasters for checking...')
    # dh_wls = ref_dem.copy()
    # dh_wls.img = out_arr
    # dh_wls.write('/calcul/malo/hugonnet/dh_wls.tif')
    out_arr[out_arr < max_dhdt[0]] = np.nan
    out_arr[out_arr > max_dhdt[1]] = np.nan

    # finding max/min of dh/dt in a kernel
    if nproc == 1:
        print('Finding min/max dhdt in radius of ' + str(cutoff_kern_size) + 'm on 1 proc...')
        max_dhdt_arr, min_dhdt_arr = robust_maxmin_disk_filter((out_arr, rad))
    else:
        print('Finding min/max dhdt in radius of ' + str(cutoff_kern_size) + 'm on ' + str(nproc) + ' procs...')
        nopt = int(np.floor(np.sqrt(nproc)))
        nblocks = [nopt, nopt]
        patches, inv, split = patchify(out_arr, nblocks, rad)

        pool = mp.Pool(nproc, maxtasksperchild=1)
        argsin = [(p, rad) for i, p in enumerate(patches)]
        outputs = pool.map(robust_maxmin_disk_filter, argsin, chunksize=1)
        pool.close()
        pool.join()

        zip_out = list(zip(*outputs))

        max_dhdt_arr = unpatchify(np.shape(out_arr), zip_out[0], inv, split)
        min_dhdt_arr = unpatchify(np.shape(out_arr), zip_out[1], inv, split)

    # temp: to check visually
    # print('Writing to rasters for checking...')
    # max_dh_img= ref_dem.copy()
    # max_dh_img.img = max_dhdt_arr
    # max_dh_img.write('/calcul/malo/hugonnet/test_max.tif')
    # max_dh_img.img = min_dhdt_arr
    # max_dh_img.write('/calcul/malo/hugonnet/test_min.tif')

    # using max/min dhdt to better condition spatio-temporal filtering from reference
    if nproc == 1:
        print('Finding min/max ref in radius of ' + str(cutoff_kern_size) + 'm on 1 proc...')
        max_ref_arr, min_ref_arr = maxmin_disk_filter((ref_arr, rad))
    else:
        print('Finding min/max ref in radius of ' + str(cutoff_kern_size) + 'm on ' + str(nproc) + ' procs...')
        nopt = int(np.floor(np.sqrt(nproc)))
        nblocks = [nopt, nopt]
        patches, inv, split = patchify(ref_arr, nblocks, rad)

        pool = mp.Pool(nproc, maxtasksperchild=1)
        argsin = [(p, rad) for i, p in enumerate(patches)]
        outputs = pool.map(maxmin_disk_filter, argsin, chunksize=1)
        pool.close()
        pool.join()

        zip_out = list(zip(*outputs))

        max_ref_arr = unpatchify(np.shape(ref_arr), zip_out[0], inv, split)
        min_ref_arr = unpatchify(np.shape(ref_arr), zip_out[1], inv, split)

    max_abs_dhdt = np.nanmax(np.stack((np.abs(min_dhdt_arr), np.abs(max_dhdt_arr))), axis=0)

    # max_dh_img.img = max_abs_dhdt
    # max_dh_img.write('/calcul/malo/hugonnet/max_abs.tif')

    min_dhdt_filt = np.nanmin(np.stack((np.zeros(np.shape(min_dhdt_arr)), min_dhdt_arr)), axis=0)
    max_dhdt_filt = np.nanmax(np.stack((np.zeros(np.shape(max_dhdt_arr)), max_dhdt_arr)), axis=0)

    # max_dh_img.img = min_dhdt_filt
    # max_dh_img.write('/calcul/malo/hugonnet/min_filt.tif')

    # max_dh_img.img = max_dhdt_filt
    # max_dh_img.write('/calcul/malo/hugonnet/max_filt.tif')

    print('Refining spatio-temporal filtering with trend values...')
    # spatial filtering refined with temporal approx of dhdt
    print('Initial valid pixels: ' + str(np.count_nonzero(~np.isnan(ds_arr))))

    for i in range(ds_arr.shape[0]):
        ds_arr[i, np.logical_or(ds_arr[i, :] > (max_ref_arr + base_thresh + 30 * max_abs_dhdt),
                                ds_arr[i, :] < (min_ref_arr - base_thresh - 30 * max_abs_dhdt))] = np.nan

    print('Pixels after refined spatial filtering: ' + str(np.count_nonzero(~np.isnan(ds_arr))))
    # temporal filtering refined with temporal approx of dhdt
    delta_t = (ref_dem_date - t_vals).astype('timedelta64[D]').astype(float) / 365.24
    dh = ref_arr[None, :, :] - ds_arr
    dt_arr = np.ones(dh.shape) * delta_t[:, None, None]
    ds_arr[np.logical_or(np.logical_and(dt_arr < 0, np.logical_or(
        dh < - (base_thresh + max_deltat_ref * max_abs_dhdt[None, :, :]) + dt_arr * 2 * max_abs_dhdt[None, :, :],
        dh > (base_thresh + max_deltat_ref * max_abs_dhdt[None, :, :]) + dt_arr * 2 * (-max_abs_dhdt[None, :, :]))),
                         np.logical_and(dt_arr > 0, np.logical_or(
                             dh > (base_thresh + max_deltat_ref * max_abs_dhdt[None, :, :]) + dt_arr * 2 * max_abs_dhdt[
                                                                                                           None, :, :],
                             dh < - (base_thresh + max_deltat_ref * max_abs_dhdt[None, :, :]) + dt_arr * 2 * (
                                 -max_abs_dhdt[None, :, :]))))] = np.nan

    print('Pixels after refined temporal filtering: ' + str(np.count_nonzero(~np.isnan(ds_arr))))

    return ds_arr


[docs]def fit_stack(fn_stack, fit_extent=None, fn_ref_dem=None, ref_dem_date=None, filt_ref='min_max',
              time_filt_thresh=[-30, 30], inc_mask=None, exc_mask=None, nproc=1, method='gpr',
              opt_gpr=False, kernel=None, filt_ls=False, conf_filt_ls=0.99, tlim=None, tstep=0.25, outfile='fit.nc',
              write_filt=False, clobber=False, merge_date=False, dask_parallel=False):
    """
    Given a netcdf stack of DEMs, perform filtering and temporal fitting of elevation with uncertainty propagation

    :param fn_stack: Filename for input netcdf file
    :param fit_extent: extent over which to limit fit, as [xmin, xmax, ymin, ymax]
    :param fn_ref_dem: Filename for input reference DEM
    :param ref_dem_date: Date of reference DEM
    :param filt_ref: Type of filtering. One of 'min_max', 'time', or 'both'. Requires a reference DEM.
    :param time_filt_thresh: Maximum dh/dt from reference DEM for time filtering
    :param inc_mask: Optional inclusion mask. Pixels outside of the mask will not be fit.
    :param exc_mask: Optional exclusion mask. Pixels inside of the mask will not be fit.
    :param nproc: Number of cores for multiprocessing [1]
    :param method: Fitting method, currently supported: Gaussian Process Regression "gpr", Ordinary Least Squares "ols"
     and Weighted Least Squares "wls" ["gpr"]
    :param opt_gpr: Run learning optimization in the GPR fitting [False]
    :param kernel: Kernel
    :param filt_ls: Filter least square with a first fit [False]
    :param conf_filt_ls: Confidence interval to filter least square fit [99%]
    :param tlim: time range [t_min t_max] over which to fit. Default is full range of stack.
    :param tstep: Temporal step for fitted stack [0.25 year]
    :param outfile: Path to outfile
    :param write_filt: Write filtered stack to file
    :param clobber: Overwrite existing output files
    :param merge_date: Merge any DEMs which have the same acquisition date
    :param dask_parallel: run with dask parallel tools
    :return:
    """

    assert method in ['gpr', 'ols', 'wls'], "Method must be one of gpr, ols or wls."
    print('Reading dataset: ' + fn_stack)

    start = time.time()

    ds = xr.open_dataset(fn_stack)
    ds.load()

    if fit_extent is not None:
        xmin, xmax, ymin, ymax = fit_extent
        ds = ds.sel(x=slice(xmin, xmax), y=slice(ymin, ymax))

    print('Original temporal size of stack is ' + str(ds.time.size))
    print('Original spatial size of stack is ' + str(ds.x.size) + ',' + str(ds.y.size))

    if inc_mask is not None:
        # ds_orig = ds.copy()
        ds = isel_maskout(ds, inc_mask)
        if ds is None:
            print('Inclusion mask has no valid pixels in this extent. Skipping...')
            return
        print('Temporal size of stack is now: ' + str(ds.time.size))
        print('Spatial size of stack is now: ' + str(ds.x.size) + ',' + str(ds.y.size))
        print('Elapsed time is ' + str(time.time() - start))

    print('Filtering with max RMSE of 20...')

    keep_vals = ds.uncert.values < 20
    ds = ds.isel(time=keep_vals)
    print('Temporal size of stack is now: ' + str(ds.time.size))

    if merge_date:
        print('Merging ASTER DEMs with exact same date...')
        ds = isel_merge_dupl_dates(ds)

    print('Final temporal size of stack is ' + str(ds.time.size))
    print('Elapsed time is ' + str(time.time() - start))

    if exc_mask is not None:
        uns_mask = get_stack_mask(exc_mask, ds)
        uns_arr = uns_mask[np.newaxis, :, :]
    else:
        uns_arr = None

    ds_arr = ds.z.values
    ds_corr = ds.corr.values
    # change correlation for SETSM segments
    ind_setsm = np.array(['SETSM' in name for name in ds.dem_names.values])
    ds_corr[ind_setsm, :] = 60.
    t_vals = ds.time.values
    uncert = ds.uncert.values
    filt_vals = (t_vals - np.datetime64('2000-01-01')).astype('timedelta64[D]').astype(int)

    # pre-filtering
    if fn_ref_dem is not None:
        assert filt_ref in ['min_max', 'time', 'both'], "fn_ref must be one of: min_max, time, both"
        ds_arr = prefilter_stack(ds, ds_arr, fn_ref_dem, t_vals, filt_ref=filt_ref, ref_dem_date=ref_dem_date,
                                 time_filt_thresh=time_filt_thresh, nproc=nproc)
        print('Elapsed time is ' + str(time.time() - start))

    # constrain variance based on manually defined dependencies
    err_arr, med_slope = constrain_var_slope_corr(ds, ds_arr, ds_corr, t_vals, uncert, fn_ref_dem=fn_ref_dem,
                                                  nproc=nproc)

    if fn_ref_dem is not None:
        ds_arr = robust_wls_ref_filter_stack(ds, ds_arr, err_arr, t_vals, fn_ref_dem,
                                             ref_dem_date=np.datetime64('2013-01-01'),
                                             max_dhdt=time_filt_thresh, nproc=nproc, cutoff_kern_size=1000,
                                             max_deltat_ref=2.,
                                             base_thresh=100.)

    # write variance stats to disk
    if exc_mask is not None:
        print('Deriving variance stats...')
        bins_slope = np.arange(0, 80, 10)
        # fn_stats_slope = os.path.join(os.path.dirname(outfile),os.path.splitext(os.path.basename(outfile))[0]+'_slope_var.csv')
        # get_var_by_bin(ds,ds_arr,med_slope,bins_slope,fn_stats_slope,inc_mask=None,exc_mask=gla_mask,rast_mask_cube=False)

        bins_corr = np.arange(0, 105, 10)
        # fn_stats_corr = os.path.join(os.path.dirname(outfile),
        #                               os.path.splitext(os.path.basename(outfile))[0] + '_corr_var.csv')
        # get_var_by_bin(ds, ds_arr, ds_corr, bins_corr, fn_stats_corr, exc_mask=gla_mask, rast_mask_cube=True)

        fn_stats_both = os.path.join(os.path.dirname(outfile),
                                     os.path.splitext(os.path.basename(outfile))[0] + 'slopecorr_var.csv')
        get_var_by_corr_slope_bins(ds, ds_arr, med_slope, bins_slope, ds_corr, bins_corr, fn_stats_both, inc_mask=None,
                                   exc_mask=exc_mask, nproc=1)
        print('Elapsed time is ' + str(time.time() - start))

        # #TODO: need to estimate dh here first?
        # fn_vgm = os.path.join(os.path.dirname(outfile), os.path.splitext(os.path.basename(outfile))[0] + '_elev_vgm.csv')
        # fn_vgm = os.path.join(os.path.dirname(outfile), os.path.splitext(os.path.basename(outfile))[0] + '_slope_vgm.csv')
        # get_vgm_by_bin(arr_vals, bin_vals, fn_prefilt_stack, outfile, inc_mask=None, exc_mask=None, nproc=1)

    # define temporal prediction output vector
    if tlim is None:
        y0 = t_vals[0].astype('datetime64[D]').astype(object).year
        y1 = t_vals[-1].astype('datetime64[D]').astype(object).year + 1.1
    else:
        y0 = tlim[0].astype('datetime64[D]').astype(object).year
        y1 = tlim[-1].astype('datetime64[D]').astype(object).year
    fit_t = np.arange(y0, y1 + tstep, tstep) - y0
    nice_fit_t = [np.timedelta64(int(d), 'D').astype(int) for d in np.round(fit_t * 365.2524)]

    # converting time values for input vector
    ftime = t_vals
    total_delta = np.datetime64('{}-01-01'.format(int(y1))) - np.datetime64('{}-01-01'.format(int(y0)))
    ftime_delta = np.array([t - np.datetime64('{}-01-01'.format(int(y0))) for t in ftime])
    time_vals = (ftime_delta / total_delta) * (int(y1) - int(y0))

    print(time_vals)

    print('Fitting with method: ' + method)

    fn_filt = os.path.join(os.path.dirname(outfile), os.path.splitext(os.path.basename(fn_stack))[0] + '_filtered.nc')

    if nproc == 1:
        print('Processing with 1 core...')
        if method == 'gpr':
            out_cube, filt_cube = gpr_wrapper((ds_arr, 0, time_vals, err_arr, fit_t, opt_gpr, kernel, uns_arr))
            cube_to_stack(ds, out_cube, y0, nice_fit_t, slope_arr=med_slope, outfile=outfile, clobber=clobber)
            if write_filt:
                cube_to_stack(ds, filt_cube, y0, filt_vals, outfile=fn_filt, clobber=clobber, filt_bool=True, ci=False)
        elif method in ['ols', 'wls']:
            if method == 'ols':
                weig = False
            else:
                weig = True
            out_arr, filt_cube = ls_wrapper((ds_arr, 0, t_vals, uncert, weig, filt_ls, conf_filt_ls))
            arr_to_img(ds, out_arr, outfile=outfile)
            if write_filt:
                cube_to_stack(ds, filt_cube, y0, filt_vals, outfile=fn_filt, clobber=clobber, filt_bool=True, ci=False)
    else:
        print('Processing with ' + str(nproc) + ' cores...')

        # if not using dask scheduler/chunking, we split manually and process with numba.jit + multiprocessing.pool+map
        if not dask_parallel:
            print('Using multiprocessing...')

            if method in ['ols', 'wls']:
                # here calculation is done matricially so we want to use all cores with the largest tiles possible
                opt_n_tiles = int(np.floor(np.sqrt(nproc)))
                n_x_tiles = opt_n_tiles
                n_y_tiles = opt_n_tiles
            elif method == 'gpr':
                # here calculation is within a for loop: better to have small tiles to get an idea of the processing speed
                n_x_tiles = np.ceil(ds['x'].shape[0] / 30).astype(int)  # break it into 10x10 tiles
                n_y_tiles = np.ceil(ds['y'].shape[0] / 30).astype(int)

                pool = mp.Pool(nproc, maxtasksperchild=1)
                split_arr = splitter(ds_arr, (n_y_tiles, n_x_tiles))
                split_err = splitter(err_arr, (n_y_tiles, n_x_tiles))

                if uns_arr is not None:
                    split_uns = splitter(uns_arr, (n_y_tiles, n_x_tiles))
                else:
                    split_uns = [None] * len(split_arr)

            if method == 'gpr':

                argsin = [(s, i, np.copy(time_vals), split_err[i], np.copy(fit_t), opt_gpr, kernel, split_uns[i]) for
                          i, s in
                          enumerate(split_arr)]
                outputs = pool.map(gpr_wrapper, argsin, chunksize=1)
                pool.close()
                pool.join()

                # this was for when mapped function was giving multiple outputs..
                # zip_out = list(zip(*outputs))
                # out_cube = stitcher(zip_out[0], (n_y_tiles, n_x_tiles))
                # filt_cube = stitcher(zip_out[1], (n_y_tiles, n_x_tiles))

                stitched_outputs = stitcher(outputs, (n_y_tiles, n_x_tiles))

            elif method in ['ols', 'wls']:
                if method == 'ols':
                    weig = False
                else:
                    weig = True
                argsin = [(s, i, np.copy(t_vals), np.copy(uncert), weig, filt_ls, conf_filt_ls) for i, s in
                          enumerate(split_arr)]
                outputs = pool.map(ls_wrapper, argsin, chunksize=1)
                pool.close()
                pool.join()

                zip_out = list(zip(*outputs))

                out_arr = stitcher(zip_out[0], (n_y_tiles, n_x_tiles))
                arr_to_img(ds, out_arr, outfile=outfile)
                if write_filt:
                    filt_cube = stitcher(zip_out[1], (n_y_tiles, n_x_tiles))
                    cube_to_stack(ds, filt_cube, y0, filt_vals, outfile=fn_filt, clobber=clobber, filt_bool=True,
                                  ci=False)

        # here we use dask instead, testing only with gpr for now
        else:
            print('Using dask distributed parallel, computing with chunks sizes of 30...')
            print('Elapsed time is ' + str(time.time() - start))

            if method == 'gpr':

                ds['err'] = (['time', 'y', 'x'], err_arr.astype(np.float32))
                # TODO: look at the details of issue before posting on xarray's GitHub: dem_names dtype "object" not recognized for writing after an "isel"
                ds = ds.drop('dem_names')

                print('Saving data to temporary file...')
                fn_tmp = os.path.join(os.path.dirname(outfile), 'tmp.nc')

                mkdir_p(os.path.dirname(fn_tmp))
                if os.path.exists(fn_tmp):
                    os.remove(fn_tmp)
                ds.to_netcdf(fn_tmp)
                ds.close()

                # TODO: getting an issue similar than this: https://github.com/pydata/xarray/issues/1836, but:
                # persist takes ages to load the data and it happens even without zipping netcdf...

                # ds_dask = xr.open_dataset(fn_tmp).chunk({'x':30,'y':30})
                # ds_dask = ds_dask.persist()

                # we load the data directly to avoid all this trouble
                print('Loading data...')

                ds_dask = xr.open_dataset(fn_tmp)

                z_vals = ds_dask.z.values
                err_vals = ds_dask.err.values

                chunk_z = ds_dask.z.chunk({'x': 150, 'y': 150})
                chunk_err = ds_dask.err.chunk({'x': 150, 'y': 150})

                print('Elapsed time is ' + str(time.time() - start))

                # client = dask.distributed.Client()

                d = gpr_dask(chunk_z, chunk_err, time_vals, fit_t)
                print('Starting compute')
                with ProgressBar():
                    out_ds = d.compute(num_workers=nproc, scheduler='processes')
                out_ds = out_ds.transpose('new_time', 'y', 'x')
                stitched_outputs = out_ds.values

        print('Elapsed time is ' + str(time.time() - start))

        print('Writing results to disk...')

        # now we write results to disk
        out_cube = stitched_outputs[:2 * len(nice_fit_t), :, :]
        cube_to_stack(ds, out_cube, y0, nice_fit_t, slope_arr=med_slope, outfile=outfile, clobber=clobber)
        if write_filt:
            filt_cube = stitched_outputs[2 * len(nice_fit_t):, :, :]
            cube_to_stack(ds, filt_cube, y0, filt_vals, outfile=fn_filt, clobber=clobber, filt_bool=True,
                          ci=False)

        print('Elapsed time is ' + str(time.time() - start))

        # to write back to full extent even if the dataset was subsetted spatially at the top
        # if inc_mask is not None:
        #     full_out_cube = np.zeros((np.shape(out_cube)[0],ds_orig.y.size,ds_orig.x.size)) * np.nan
        #     full_out_cube[slices[0],slices[1]] = out_cube
        #     out_cube = full_out_cube
        #     ds=ds_orig
        # if inc_mask is not None:
        #     full_filt_cube = np.zeros((np.shape(filt_cube)[0], ds_orig.y.size, ds_orig.x.size)) * np.nan
        #     full_filt_cube[slices[0], slices[1]] = filt_cube
        #     filt_cube = full_filt_cube