Source code for pyddem.tdem_tools

"""
pyddem.tdem_tools provides tools to post-process DEM stacks: volume integration over specific outlines, comparison to point data, spatial aggregation...
"""
import xarray as xr
import os
import sys
import shutil
import numpy as np
from itertools import groupby
from operator import itemgetter
import multiprocessing as mp
from scipy.interpolate import interp1d
from osgeo import gdal, osr, ogr
import pyproj
import time
from datetime import datetime as dt
import pandas as pd
import pyddem.stack_tools as st
import pyddem.vector_tools as vt
import pyddem.fit_tools as ft
import pyddem.spstats_tools as spt
import pyddem.volint_tools as volt
from pybob.coreg_tools import get_slope, create_stable_mask, dem_coregistration
from pybob.GeoImg import GeoImg
from pybob.ICESat import ICESat
from glob import glob
import itertools

def inters_feat_shp_stacks(fn_shp, list_fn_stack, feat_field_name):
    """
    Find all intersecting stacks for features of a shapefile, for a list of netCDF stacks (tiles for example)

    :param fn_shp: Filename of shapefile
    :param list_fn_stack: List of filenames of netCDF stacks
    :param feat_field_name: Name of shapefile feature to sort by (e.g., RGIIds for RGI glaciers)

    :returns: List of all features found, corresponding list of intersecting stacks per feature
    """
    # get intersecting rgiid for each stack extent
    list_list_rgiid = []
    for fn_stack in list_fn_stack:
        # TODO: temporary test
        # ds = xr.open_dataset(fn_stack)
        # extent, proj = st.extent_stack(ds)
        ds = xr.open_dataset(fn_stack)
        tile_name = st.tilename_stack(ds)
        lat, lon = vt.SRTMGL1_naming_to_latlon(tile_name)
        extent = [lon, lat, lon + 1, lat + 1]
        proj = osr.SpatialReference()
        proj.ImportFromEPSG(4326)
        list_rgiid = vt.list_shp_field_inters_extent(fn_shp, feat_field_name, extent, proj.ExportToWkt())
        list_list_rgiid.append(list_rgiid)

    # get all rgiids intersecting all stacks without duplicates
    all_rgiid = []
    for list_rgiid in list_list_rgiid:
        all_rgiid = all_rgiid + list_rgiid
    all_rgiid = list(set(all_rgiid))

    # inverting to have intersecting stacks by rgiid: more practical to compute that way
    list_list_stack_by_rgiid = []
    for rgiid in all_rgiid:
        list_stacks = []
        for fn_stack in list_fn_stack:
            if rgiid in list_list_rgiid[list_fn_stack.index(fn_stack)]:
                list_stacks.append(fn_stack)
        list_list_stack_by_rgiid.append(sorted(list_stacks))

    return all_rgiid, list_list_stack_by_rgiid
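# Usage sketch (hypothetical paths and naming): map each RGI outline to the netCDF tiles that
# intersect it. The shapefile path, the '*_final.nc' pattern and the 'RGIId' field below are
# placeholders chosen for illustration, not pyddem defaults.
def _example_inters_feat_shp_stacks(fn_shp='/data/rgi/rgi60_region06.shp',
                                    dir_stacks='/data/stacks'):
    list_fn_stack = glob(os.path.join(dir_stacks, '*_final.nc'))
    all_rgiid, stacks_by_rgiid = inters_feat_shp_stacks(fn_shp, list_fn_stack, 'RGIId')
    for rgiid, stacks in zip(all_rgiid, stacks_by_rgiid):
        print(rgiid + ': ' + str(len(stacks)) + ' intersecting stack(s)')
    return all_rgiid, stacks_by_rgiid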
def sel_dc(ds, tlim, mask):
    """
    Subset datacube in space and time with a spatial mask and time limits

    :param ds: xarray Dataset
    :param tlim: Time limits
    :param mask: Raster boolean mask

    :returns: Subset Dataset, Subset boolean mask, Spatial index slices (to identify subset position in the original datacube, for instance)
    """
    # select data cube on temporal and spatial mask
    if tlim is None:
        time1 = time2 = None
    else:
        time1, time2 = tlim

    index = np.where(mask)
    minx = ds.x[np.min(index[1])].values
    maxx = ds.x[np.max(index[1])].values
    miny = ds.y[np.min(index[0])].values
    maxy = ds.y[np.max(index[0])].values

    dc = ds.sel(dict(time=slice(time1, time2), x=slice(minx, maxx), y=slice(miny, maxy)))
    submask = mask[np.min(index[0]):np.max(index[0] + 1), np.min(index[1]):np.max(index[1] + 1)]

    return dc, submask, (slice(minx, maxx), slice(miny, maxy))
def int_dc(dc, mask, **kwargs):
    """
    Wrapper for classic integration of elevation time series into volume change time series

    :param dc: xarray Dataset (subset)
    :param mask: Raster boolean mask

    :returns: DataFrame of integrated elevation change
    """
    # integrate data cube over masked area
    dh = dc.variables['z'].values - dc.variables['z'].values[0]
    err = np.sqrt(dc.variables['z_ci'].values ** 2 + dc.variables['z_ci'].values[0] ** 2)
    ref_elev = dc.variables['z'].values[0]
    slope = get_slope(st.make_geoimg(dc, 0)).img
    slope[np.logical_or(~np.isfinite(slope), slope > 70)] = np.nan

    t, y, x = np.shape(dh)
    dx = np.round((dc.x.max().values - dc.x.min().values) / float(x))

    df = volt.hypso_dc(dh, err, ref_elev, mask, dx, **kwargs)

    return df
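# Usage sketch (hypothetical filenames and RGIId): rasterize one outline on the stack grid with
# vt.geoimg_mask_on_feat_shp_ds, subset the datacube with sel_dc, then integrate it with int_dc.
def _example_sel_int_dc(fn_stack='/data/stacks/N64W017_final.nc',
                        fn_shp='/data/rgi/rgi60_region06.shp', rgiid='RGI60-06.00001'):
    ds = xr.open_dataset(fn_stack)
    ds_shp = gdal.OpenEx(fn_shp, gdal.OF_VECTOR)
    layer_name = os.path.splitext(os.path.basename(fn_shp))[0]
    geoimg = st.make_geoimg(ds, 0)
    mask = vt.geoimg_mask_on_feat_shp_ds(ds_shp, geoimg, layer_name=layer_name,
                                         feat_id='RGIId', feat_val=rgiid)
    tlim = (np.datetime64('2000-01-01'), np.datetime64('2020-01-01'))
    dc, submask, _ = sel_dc(ds, tlim, mask)
    return int_dc(dc, submask)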
def get_inters_stack_dem(list_fn_stack, ext, proj):

    # get stacks that intersect the extent
    poly = vt.poly_from_extent(ext)

    list_inters_stack = []
    for fn_stack in list_fn_stack:
        ds = xr.open_dataset(fn_stack)
        ext_st, proj_st = st.extent_stack(ds)
        poly_st = vt.poly_from_extent(ext_st)
        trans = vt.coord_trans(True, proj_st, True, proj)
        poly_st.Transform(trans)
        if poly_st.Intersect(poly):
            list_inters_stack.append(fn_stack)

    return list_inters_stack


def comp_stacks_dem(list_fn_stack, fn_dem, inc_mask=None, exc_mask=None, get_timelapse_filtstack=False, outfile=None):

    # get list of intersecting stacks
    ext, proj = vt.extent_rast(fn_dem)
    list_inters_stack = get_inters_stack_dem(list_fn_stack, ext, proj)
    dem_full = GeoImg(fn_dem)

    list_dh, list_z_score, list_dt = ([] for i in range(3))
    for fn_stack in list_inters_stack:
        ds = xr.open_dataset(fn_stack)
        ref_img = st.make_geoimg(ds)
        mask = create_stable_mask(ref_img, exc_mask, inc_mask)
        dem = dem_full.reproject(ref_img)

        x, y = np.where(mask)
        t = np.array([dem.datetime] * len(x))
        dem_sub = dem[x, y]

        comp = ds.interp(time=t, x=x, y=y)
        comp_h = comp.variables['z'].values
        comp_ci = comp.variables['z_ci'].values

        dh = dem_sub - comp_h
        z_score = dh / comp_ci

        list_dh.append(dh)
        list_z_score.append(z_score)

        if get_timelapse_filtstack:
            # here we assume the filtered stack is stored at the same location
            fn_filt = os.path.join(os.path.dirname(fn_stack), os.path.basename(fn_stack).split('_')[0] + '_filtered.nc')
            ds_filt = xr.open_dataset(fn_filt)
            ds_near = ds_filt.isel(time=t, x=x, y=y, method='nearest')
            near_times = ds_near.time.values
            delta_t = near_times - t
            list_dt.append(delta_t)

    full_dh = np.concatenate(list_dh)
    full_z_score = np.concatenate(list_z_score)
    full_dt = np.concatenate(list_dt)

    df = pd.DataFrame()
    df = df.assign(dh=full_dh, z_score=full_z_score, dt=full_dt)

    if outfile is None:
        return df
    else:
        df.to_csv(outfile)


def datetime_to_yearfraction(date):

    # ref: https://stackoverflow.com/questions/6451655/python-how-to-convert-datetime-dates-to-decimal-years
    def sinceEpoch(date):
        # returns seconds since epoch
        return time.mktime(date.timetuple())
    s = sinceEpoch

    year = date.year
    startOfThisYear = dt(year=year, month=1, day=1)
    startOfNextYear = dt(year=year + 1, month=1, day=1)

    yearElapsed = s(date) - s(startOfThisYear)
    yearDuration = s(startOfNextYear) - s(startOfThisYear)
    fraction = yearElapsed / yearDuration

    return date.year + fraction


def create_tmp_dir_for_outfile(file_out):

    tmp_dir = os.path.join(os.path.dirname(file_out), 'tmp_' + os.path.splitext(os.path.basename(file_out))[0]) + os.path.sep
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)

    return tmp_dir


def remove_tmp_dir_for_outfile(file_out):

    tmp_dir = os.path.join(os.path.dirname(file_out), 'tmp_' + os.path.splitext(os.path.basename(file_out))[0]) + os.path.sep
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir, ignore_errors=True)
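# Minimal illustration of the helpers above (the output path is a placeholder): decimal-year
# conversion, and the temporary directory created next to a hypothetical output file.
def _example_helpers(file_out='/data/out/topo.tif'):
    yearfrac = datetime_to_yearfraction(dt(2004, 7, 2))  # roughly 2004.5, July 2 being near mid-year
    tmp_dir = create_tmp_dir_for_outfile(file_out)       # creates /data/out/tmp_topo/
    remove_tmp_dir_for_outfile(file_out)                 # removes that temporary directory again
    return yearfrac, tmp_dir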
def saga_aspect_slope_curvature(dem_in, topo_out, method_nb=5):
    """
    Derive maximum curvature using SAGA (command line)

    :param dem_in: dem input
    :param topo_out: raster out (3 bands: slope, aspect, maximum curvature)
    :param method_nb: algorithm used, see function description

    :returns:

    requirement: SAGA 2.X with X > 3
    # ref: http://www.saga-gis.org/saga_tool_doc/2.2.3/ta_morphometry_0.html

    aspect, slope, curvature method numbers:
    [0] maximum slope (Travis et al. 1975)
    [1] maximum triangle slope (Tarboton 1997)
    [2] least squares fitted plane (Horn 1981, Costa-Cabral & Burgess 1996)
    [3] 6 parameter 2nd order polynom (Evans 1979)
    [4] 6 parameter 2nd order polynom (Heerdegen & Beran 1982)
    [5] 6 parameter 2nd order polynom (Bauer, Rohdenburg, Bork 1985)
    [6] 9 parameter 2nd order polynom (Zevenbergen & Thorne 1987)
    [7] 10 parameter 3rd order polynom (Haralick 1983)

    unit slope: 0=rad, 1=deg, 2=percent
    unit aspect: 0=rad, 1=deg
    """
    tmp_dir = create_tmp_dir_for_outfile(topo_out)

    saga_elev = tmp_dir + 'elev_temp.sgrd'
    saga_slope = tmp_dir + 'slope_temp.sgrd'
    saga_aspect = tmp_dir + 'aspect_temp.sgrd'
    saga_max_curv = tmp_dir + 'max_curv_temp.sgrd'
    tif_slope = tmp_dir + 'slope_temp.tif'
    tif_aspect = tmp_dir + 'aspect_temp.tif'
    tif_max_curv = tmp_dir + 'max_curv_temp.tif'
    output_vrt = tmp_dir + 'stack.vrt'

    os.system('saga_cmd io_gdal 0 -GRIDS ' + saga_elev + ' -FILES ' + dem_in)
    os.system('saga_cmd ta_morphometry 0 -ELEVATION ' + saga_elev + ' -SLOPE ' + saga_slope + ' -ASPECT ' + saga_aspect
              + ' -C_MAXI ' + saga_max_curv + ' -METHOD ' + str(method_nb) + ' -UNIT_SLOPE 1 -UNIT_ASPECT 1')
    os.system('saga_cmd io_gdal 2 -GRIDS ' + saga_slope + ' -FILE ' + tif_slope)
    os.system('saga_cmd io_gdal 2 -GRIDS ' + saga_aspect + ' -FILE ' + tif_aspect)
    os.system('saga_cmd io_gdal 2 -GRIDS ' + saga_max_curv + ' -FILE ' + tif_max_curv)
    os.system('gdalbuildvrt -separate -overwrite ' + output_vrt + ' ' + tif_slope + ' ' + tif_aspect + ' ' + tif_max_curv)
    os.system('gdal_translate ' + output_vrt + ' ' + topo_out)

    remove_tmp_dir_for_outfile(topo_out)
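# Usage sketch (hypothetical filenames): compute the 3-band topographic raster from a reference
# DEM and read back the maximum-curvature band; requires the 'saga_cmd' executable on the PATH.
def _example_saga_curvature(fn_dem='/data/ref_dem.tif', fn_topo='/data/topo.tif'):
    saga_aspect_slope_curvature(fn_dem, fn_topo, method_nb=5)
    ds_topo = gdal.Open(fn_topo, gdal.GA_ReadOnly)
    curv = ds_topo.GetRasterBand(3).ReadAsArray()  # band 1: slope, band 2: aspect, band 3: max curvature
    ds_topo = None
    return curv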
[docs]def icesat_comp_wrapper(argsin): """ Multiprocessing wrapper to compare ICESat data with elevation time series :param argsin: Tuple of arranged data (see comp_stacks_icesat) :returns: Arranged output data (see_comp_stacks_icesat) """ fn_stack,ice_coords,ice_latlon,ice_elev,ice_date,groups,dates,read_filt,gla_mask,inc_mask,exc_mask = argsin full_h, full_dh, full_z_score, full_dt, full_pos, full_slp, full_lat, full_lon, full_dh_ref, full_curv, full_dh_tot, full_time = ([] for i in range(12)) # full_time = np.array([],dtype='datetime64[D]') ds = xr.open_dataset(fn_stack) ds.load() tile_name = st.tilename_stack(ds) tmp_h = st.make_geoimg(ds) tmp_ci = st.make_geoimg(ds) tmp_dhtot = st.make_geoimg(ds) tmp_slope = st.make_geoimg(ds) tmp_slope.img = ds.slope.values tmp_dhtot.img = ds.z[-1, :].values - ds.z[0, :].values # tmp_slope.img = np.ones(np.shape(tmp_slope.img))*20 ds_sub = ds.interp(time=dates) terrain_mask = None get_curv = True #glacier mask has priority and won't be changed, gla = 2 if gla_mask is not None: print('Tile '+tile_name+': deriving terrain glacier mask...') #rasterizing mask and changing to float to use geoimg.raster_points2 mask = ft.get_stack_mask(gla_mask,ds) tmp_exc_mask = st.make_geoimg(ds) tmp_exc_mask.img = np.zeros(np.shape(mask),dtype='float32') tmp_exc_mask.img[mask] = 2 terrain_mask = tmp_exc_mask #here we take the difference of inclusion and exclusion mask for the rest of terrain, inc = 1, exc = 0 if inc_mask is not None: print('Tile ' + tile_name + ': deriving terrain inclusion mask...') # rasterizing mask and changing to float to use geoimg.raster_points2 mask = ft.get_stack_mask(inc_mask, ds) if terrain_mask is not None: land_mask = np.logical_and(terrain_mask.img == 0, mask) terrain_mask.img[land_mask] = 1 else: tmp_inc_mask = st.make_geoimg(ds) tmp_inc_mask.img = np.zeros(np.shape(mask),dtype='float32') tmp_inc_mask[mask] = 1 terrain_mask = tmp_inc_mask if exc_mask is not None: print('Tile ' + tile_name + ': deriving terrain exclusion mask...') # rasterizing mask and changing to float to use geoimg.raster_points2 mask = ft.get_stack_mask(exc_mask, ds) if terrain_mask is not None: noland_mask = np.logical_and(terrain_mask.img <= 1, mask) terrain_mask.img[noland_mask] = 0 else: tmp_exc_mask = st.make_geoimg(ds) tmp_exc_mask.img = np.ones(np.shape(mask), dtype='float32') tmp_exc_mask[mask] = 0 terrain_mask = tmp_exc_mask fn_raw = os.path.join(os.path.dirname(fn_stack), os.path.basename(fn_stack).split('_')[0] + '.nc') ds_raw = xr.open_dataset(fn_raw) tmp_ref_h = st.make_geoimg(ds_raw,band=(slice(0,None),slice(0,None)),var='ref_z') tmp_ref_h = tmp_ref_h.reproject(tmp_h) #get curvature if get_curv: fn_tmp_ref = os.path.join(os.path.dirname(fn_stack),'tmp_ref_'+os.path.basename(fn_stack).split('_')[0]+'.tif') tmp_ref_h.write(fn_tmp_ref) fn_tmp_curv = os.path.join(os.path.dirname(fn_stack),'tmp_curv_'+os.path.basename(fn_stack).split('_')[0]+'.tif') saga_aspect_slope_curvature(fn_tmp_ref,fn_tmp_curv) ds_curv = gdal.Open(fn_tmp_curv,gdal.GA_ReadOnly) curv_arr = ds_curv.GetRasterBand(3).ReadAsArray() tmp_curv = GeoImg(fn_tmp_curv) tmp_curv.img = curv_arr ds_curv = None os.remove(fn_tmp_curv) os.remove(fn_tmp_ref) else: tmp_curv = st.make_geoimg(ds) tmp_curv.img = np.zeros(np.shape(tmp_curv.img),dtype='float32') if read_filt: print('Tile '+tile_name+': getting original data dates from filtered array...') #we read the boolean data cube indicating positions where original data was used tmp_dt = st.make_geoimg(ds) fn_filt = os.path.join(os.path.dirname(fn_stack), 
os.path.basename(fn_stack).split('_')[0] + '_filtered.nc') ds_filt = xr.open_dataset(fn_filt) ds_filt.load() #first, remove duplicate dates by merging boolean arrays for same dates t_vals = list(ds_filt.time.values) dates_rm_dupli = sorted(list(set(t_vals))) ind_firstdate = [] for i, date in enumerate(dates_rm_dupli): ind_firstdate.append(t_vals.index(date)) ds_filt2 = ds_filt.isel(time=np.array(ind_firstdate)) for i in range(len(dates_rm_dupli)): t_ind = (t_vals == dates_rm_dupli[i]) if len(t_ind) > 1: ds_filt2.z.values[i, :] = np.any(ds_filt.z[t_vals == dates_rm_dupli[i], :].values.astype(bool), axis=0) # getting time data as days since 2000 y0 = np.datetime64('2000-01-01') ftime = ds_filt2.time.values ftime_delta = np.array([t - y0 for t in ftime]) days = [td.astype('timedelta64[D]').astype(int) for td in ftime_delta] # reindex to get closest not-NaN time value of the date vector filt_arr = np.array(ds_filt2.z.values, dtype='float32') filt_arr[filt_arr == 0] = np.nan days = np.array(days) filt_arr = filt_arr * days[:, None, None] at_least_2 = np.count_nonzero(~np.isnan(filt_arr), axis=0) >= 2 filt_tmp = filt_arr[:, at_least_2] out_arr = np.copy(filt_tmp) for i in range(np.shape(filt_tmp)[1]): ind = ~np.isnan(filt_tmp[:, i]) fn = interp1d(days[ind], filt_tmp[:, i][ind], kind='nearest', fill_value='extrapolate', assume_sorted=True) out_arr[:, i] = fn(days) filt_arr[:, at_least_2] = out_arr ds_filt2.z.values = filt_arr ds_filt_sub = ds_filt2.reindex(time=dates, method='nearest') for i, group in enumerate(groups): #keep only a campaign group pts_idx_dt = np.array(ice_date == group) date = dates[i] print('Tile '+tile_name+': calculating differences for the ' + str( np.count_nonzero(pts_idx_dt)) + ' points of campaign:' + str(date)) subsamp_ice = [tup for i, tup in enumerate(ice_coords) if pts_idx_dt[i]] subsamp_latlon = [tup for i, tup in enumerate(ice_latlon) if pts_idx_dt[i]] tmp_h.img = ds_sub.z[i, :].values comp_pts_h = tmp_h.raster_points(subsamp_ice, nsize=3, mode='linear') tmp_ci.img = ds_sub.z_ci[i, :].values comp_pts_ci = tmp_ci.raster_points(subsamp_ice, nsize=3, mode='linear') comp_pts_dhtot = tmp_dhtot.raster_points(subsamp_ice, nsize=3, mode='linear') comp_pts_slope = tmp_slope.raster_points(subsamp_ice, nsize=3, mode='linear') comp_pts_ref_h = tmp_ref_h.raster_points(subsamp_ice, nsize=3, mode='linear') comp_pts_curv = tmp_curv.raster_points(subsamp_ice,nsize=3,mode='linear') dh = ice_elev[pts_idx_dt] - comp_pts_h good_vals = np.isfinite(dh) dh = dh[good_vals] h = ice_elev[pts_idx_dt][good_vals] z_score = dh / comp_pts_ci[good_vals] slp = comp_pts_slope[good_vals] dh_ref = ice_elev[pts_idx_dt] - comp_pts_ref_h dh_ref = dh_ref[good_vals] curv = comp_pts_curv[good_vals] dh_tot = comp_pts_dhtot[good_vals] if read_filt: day_diff = (date - y0).astype('timedelta64[D]').astype(int) tmp_dt.img = np.abs(ds_filt_sub.z[i, :].values - np.ones(np.shape(tmp_dt.img)) * day_diff) comp_pts_dt = tmp_dt.raster_points(subsamp_ice, nsize=3, mode='mean') dt_out = comp_pts_dt[good_vals] else: dt_out = np.zeros(len(dh)) * np.nan if terrain_mask is not None: comp_pts_mask = terrain_mask.raster_points(subsamp_ice, nsize=5, mode='nearest') pos = comp_pts_mask[good_vals] else: pos = np.zeros(len(dh)) good_subsamp_latlon = [tup for i, tup in enumerate(subsamp_latlon) if good_vals[i]] if len(good_subsamp_latlon) > 0: lon = np.array([tup[1] for i, tup in enumerate(good_subsamp_latlon)]) lat = np.array([tup[0] for i, tup in enumerate(good_subsamp_latlon)]) else: lat=np.array([]) lon=np.array([]) 
full_h.append(h) full_dh.append(dh) full_z_score.append(z_score) full_dt.append(dt_out) full_pos.append(pos) full_slp.append(slp) full_time.append(np.array([date]*len(dh),dtype='datetime64[D]')) full_curv.append(curv) full_lat.append(lat) full_lon.append(lon) full_dh_ref.append(dh_ref) full_dh_tot.append(dh_tot) full_h = np.concatenate(full_h) full_dh = np.concatenate(full_dh) full_z_score = np.concatenate(full_z_score) full_dt = np.concatenate(full_dt) full_pos = np.concatenate(full_pos) full_slp = np.concatenate(full_slp) full_time = np.concatenate(full_time) full_lat = np.concatenate(full_lat) full_lon = np.concatenate(full_lon) full_dh_ref = np.concatenate(full_dh_ref) full_curv = np.concatenate(full_curv) full_dh_tot = np.concatenate(full_dh_tot) return full_h, full_dh, full_z_score, full_dt, full_pos, full_slp, full_time, full_lon, full_lat, full_dh_ref, full_curv, full_dh_tot
[docs]def comp_stacks_icesat(list_fn_stack,fn_icesat,gla_mask=None,inc_mask=None,exc_mask=None,nproc=1,read_filt=False,shift=None): """ Compare ICESat data with elevation time series :param list_fn_stack: List of filename of netCDF to compare to :param fn_icesat: Filename of ICESat HDF5 file :param gla_mask: Filename of glacier shapefile :param inc_mask: Filename of inclusion shapefile (e.g. land) :param exc_mask: Filename of exclusion shapeifle (e.g. ocean) :param nproc: Number of cores used for multiprocessing [1] :param read_filt: Boolean, read boolean data cube of valid observation :returns: Arranged output data (see_comp_stacks_icesat) """ ice = ICESat(fn_icesat) el_limit = -200 mykeep = ice.elev > el_limit #exception for North Asia subreg 3 ICESat file if os.path.basename(fn_icesat) == 'ICESat_10_3_rgi50_NorthAsia.h5': keep_west = np.logical_and(ice.lon<-172,ice.lon>-179.999) keep_east_1 = np.logical_and(ice.lon<96,ice.lon>88) keep_east_2 = np.logical_and(ice.lon<146.999,ice.lon>138) keep_northasia_3 = np.logical_or.reduce((keep_west,keep_east_1,keep_east_2)) mykeep = np.logical_and(mykeep,keep_northasia_3) ice.x = ice.x[mykeep] ice.y = ice.y[mykeep] ice.lat = ice.lat[mykeep] ice.lon = ice.lon[mykeep] ice.elev = ice.elev[mykeep] ice.UTCTime = ice.UTCTime[mykeep] ice.xy = list(zip(ice.x, ice.y)) #putting code above in ICESat.clean() breaks co-registration... # ice.clean(el_limit=-200) #get intersecting tiles bounds = ice.get_bounds() proj = osr.SpatialReference() proj.ImportFromEPSG(4326) proj_wkt = proj.ExportToWkt() list_inters = get_inters_stack_dem(list_fn_stack,[bounds[0],bounds[2],bounds[1],bounds[3]],proj_wkt) if len(list_inters) == 0: print('Found no intersection to ICESat file: '+fn_icesat) raise ValueError('No intersecting file!') #laser campaigns of ICESat laser_op_icesat = [(dt(2003, 2, 20), dt(2003, 3, 29)), (dt(2003, 9, 25), dt(2003, 11, 19)), (dt(2004, 2, 17), dt(2004, 3, 21)), (dt(2004, 5, 18), dt(2004, 6, 21)), (dt(2004, 10, 3), dt(2004, 11, 8)), (dt(2005, 2, 17), dt(2005, 3, 24)), (dt(2005, 5, 20), dt(2005, 6, 23)), (dt(2005, 10, 21), dt(2005, 11, 24)), (dt(2006, 2, 22), dt(2006, 3, 28)), (dt(2006, 5, 24), dt(2006, 6, 26)), (dt(2006, 10, 25), dt(2006, 11, 27)), (dt(2007, 3, 12), dt(2007, 4, 14)), (dt(2007, 10, 2), dt(2007, 11, 5)), (dt(2008, 2, 17), dt(2008, 3, 21)), (dt(2008, 10, 4), dt(2008, 10, 19)), (dt(2008, 11, 25), dt(2008, 12, 17)), (dt(2009, 3, 9), dt(2009, 4, 11)), (dt(2009, 9, 30), dt(2009, 10, 11))] #campaign names laser_op_name = ['1AB', '2A', '2B', '2C', '3A', '3B', '3C', '3D', '3E', '3F', '3G', '3H', '3I', '3J', '3K', '2D', '2E', '2F'] # group ICESat dates by operation utc_days = ice.UTCTime for i in range(len(laser_op_icesat)): # datetime to UTC days start_dt = datetime_to_yearfraction(laser_op_icesat[i][0]) * 365.2422 - 10 end_dt = datetime_to_yearfraction(laser_op_icesat[i][1]) * 365.2422 + 10 mean_dt = 0.5 * (start_dt + end_dt) idx = np.logical_and(utc_days < end_dt, utc_days > start_dt) utc_days[idx] = mean_dt groups = sorted(list(set(list(utc_days)))) dates = np.array( [np.datetime64('01-01-01') + np.timedelta64(int(np.floor(group - 365.2422)), 'D') for group in groups]) #prepare icesat arrays to pass to wrapper per stack, to avoid reading HDF5/NetCDF multiple times if doing parallel icesat_argsin = [] for fn_stack in list_inters: ds = xr.open_dataset(fn_stack) tile_name = st.tilename_stack(ds) lat, lon = vt.SRTMGL1_naming_to_latlon(tile_name) pts_idx = np.logical_and.reduce((ice.lat > lat, ice.lat <= lat + 1, ice.lon > lon, ice.lon <= lon + 1)) 
check = np.count_nonzero(pts_idx) print('Tile ' + tile_name + ': found ' + str(check) + ' ICESat points') if check > 0: _, utm = vt.latlon_to_UTM(lat, lon) ice.project('epsg:{}'.format(vt.epsg_from_utm(utm))) ice_coords = [tup for i, tup in enumerate(ice.xy) if pts_idx[i]] ice_elev = ice.elev[pts_idx] ice_date = ice.UTCTime[pts_idx] ice_latlon = np.array(list(zip(ice.lat[pts_idx],ice.lon[pts_idx]))) icesat_argsin.append((fn_stack,np.copy(ice_coords),np.copy(ice_latlon),np.copy(ice_elev),np.copy(ice_date),np.copy(groups),np.copy(dates),read_filt,gla_mask,inc_mask,exc_mask)) if nproc == 1: list_h, list_dh, list_zsc, list_dt, list_pos, list_slp, list_time, list_lat, list_lon, list_dh_ref, list_curv, list_dh_tot = ([] for i in range(11)) for i in range(len(icesat_argsin)): tmp_h, tmp_dh, tmp_zsc, tmp_dt, tmp_pos, tmp_slp, tmp_time, tmp_lon, tmp_lat, tmp_dh_ref, tmp_curv, tmp_dh_tot = icesat_comp_wrapper(icesat_argsin[i]) list_h.append(tmp_h) list_dh.append(tmp_dh) list_zsc.append(tmp_zsc) list_dt.append(tmp_dt) list_pos.append(tmp_pos) list_slp.append(tmp_slp) list_time.append(tmp_time) list_lat.append(tmp_lat) list_lon.append(tmp_lon) list_dh_ref.append(tmp_dh_ref) list_curv.append(tmp_curv) list_dh_tot.append(tmp_dh_tot) h = np.concatenate(list_h) dh = np.concatenate(list_dh) zsc = np.concatenate(list_zsc) dt_out = np.concatenate(list_dt) pos = np.concatenate(list_pos) slp = np.concatenate(list_slp) t = np.concatenate(list_time) lat = np.concatenate(list_lat) lon = np.concatenate(list_lon) dh_ref = np.concatenate(list_dh_ref) curv = np.concatenate(list_curv) dh_tot = np.concatenate(list_dh_tot) else: nproc=min(len(icesat_argsin),nproc) print('Using '+str(nproc)+' processors...') pool = mp.Pool(nproc,maxtasksperchild=1) outputs = pool.map(icesat_comp_wrapper,icesat_argsin) pool.close() pool.join() zip_out = list(zip(*outputs)) h = np.concatenate(zip_out[0]) dh = np.concatenate(zip_out[1]) zsc = np.concatenate(zip_out[2]) dt_out = np.concatenate(zip_out[3]) pos = np.concatenate(zip_out[4]) slp = np.concatenate(zip_out[5]) t = np.concatenate(zip_out[6]) lon = np.concatenate(zip_out[7]) lat = np.concatenate(zip_out[8]) dh_ref = np.concatenate(zip_out[9]) curv = np.concatenate(zip_out[10]) dh_tot = np.concatenate(zip_out[11]) return h, dh, zsc, dt_out, pos, slp, t, lon, lat, dh_ref, curv, dh_tot
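# Usage sketch (hypothetical paths): validate a set of elevation time series against an ICESat
# campaign file and store the per-point differences as a .csv; the mask shapefiles are optional.
def _example_comp_stacks_icesat(dir_stacks='/data/stacks',
                                fn_icesat='/data/icesat/ICESat_06_rgi50_Iceland.h5',
                                fn_gla_shp='/data/rgi/rgi60_region06.shp',
                                fn_out='/data/validation/icesat_diff.csv'):
    list_fn_stack = glob(os.path.join(dir_stacks, '*_final.nc'))
    h, dh, zsc, dt_out, pos, slp, t, lon, lat, dh_ref, curv, dh_tot = comp_stacks_icesat(
        list_fn_stack, fn_icesat, gla_mask=fn_gla_shp, nproc=4, read_filt=True)
    df = pd.DataFrame()
    df = df.assign(h=h, dh=dh, zsc=zsc, dt_out=dt_out, pos=pos, slp=slp, t=t, lon=lon, lat=lat,
                   dh_ref=dh_ref, curv=curv, dh_tot=dh_tot)
    df.to_csv(fn_out, index=False)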
[docs]def comp_stacks_icebridge(list_fn_stack,fn_icebridge,gla_mask=None,inc_mask=None,exc_mask=None,nproc=1,read_filt=False): """ Compare IceBridge data with elevation time series :param list_fn_stack: List of filename of netCDF to compare to :param fn_icebridge: Filename of IceBridge post-processed into .csv :param gla_mask: Filename of glacier shapefile :param inc_mask: Filename of inclusion shapefile (e.g. land) :param exc_mask: Filename of exclusion shapeifle (e.g. ocean) :param nproc: Number of cores used for multiprocessing [1] :param read_filt: Boolean, read boolean data cube of valid observation :returns: Arranged output data (see_comp_stacks_icesat) """ iceb = pd.read_csv(fn_icebridge) #get intersecting tiles bounds = (iceb.lon.min(), iceb.lon.max(), iceb.lat.min(), iceb.lat.max()) proj = osr.SpatialReference() proj.ImportFromEPSG(4326) proj_wkt = proj.ExportToWkt() list_inters = get_inters_stack_dem(list_fn_stack,[bounds[0],bounds[2],bounds[1],bounds[3]],proj_wkt) if len(list_inters) == 0: print('Found no intersection to IceBridge file: '+fn_icebridge) raise ValueError('No intersecting file!') groups = sorted(list(set(list(pd.to_datetime(iceb.t.values).values.astype('datetime64[D]'))))) dates = groups #prepare icesat arrays to pass to wrapper per stack, to avoid reading HDF5/NetCDF multiple times if doing parallel icebridge_argsin = [] for fn_stack in list_inters: ds = xr.open_dataset(fn_stack) tile_name = st.tilename_stack(ds) lat, lon = vt.SRTMGL1_naming_to_latlon(tile_name) pts_idx = np.logical_and.reduce((iceb.lat > lat, iceb.lat <= lat + 1, iceb.lon > lon, iceb.lon <= lon + 1)) check = np.count_nonzero(pts_idx) print('Tile ' + tile_name + ': found ' + str(check) + ' IceBridge points') if check > 0: _, utm = vt.latlon_to_UTM(lat, lon) dest_proj ='epsg:{}'.format(vt.epsg_from_utm(utm)) dest_proj = pyproj.Proj(init=dest_proj) wgs84 = pyproj.Proj(init='epsg:4326') x, y = pyproj.transform(wgs84, dest_proj, iceb.lon.values, iceb.lat.values) xy = list(zip(x,y)) ice_coords = [tup for i, tup in enumerate(xy) if pts_idx[i]] ice_elev = iceb.h[pts_idx].values ice_date = pd.to_datetime(iceb.t[pts_idx].values).values.astype('datetime64[D]') ice_latlon = np.array(list(zip(iceb.lat[pts_idx].values,iceb.lon[pts_idx].values))) icebridge_argsin.append((fn_stack,np.copy(ice_coords),np.copy(ice_latlon),np.copy(ice_elev),np.copy(ice_date),np.copy(groups),np.copy(dates),read_filt,gla_mask,inc_mask,exc_mask)) if nproc == 1: list_h, list_dh, list_zsc, list_dt, list_pos, list_slp, list_time, list_lat, list_lon, list_dh_ref, list_curv, list_dh_tot = ([] for i in range(11)) for i in range(len(icebridge_argsin)): tmp_h, tmp_dh, tmp_zsc, tmp_dt, tmp_pos, tmp_slp, tmp_time, tmp_lon, tmp_lat, tmp_dh_ref, tmp_curv, tmp_dh_tot = icesat_comp_wrapper(icebridge_argsin[i]) list_h.append(tmp_h) list_dh.append(tmp_dh) list_zsc.append(tmp_zsc) list_dt.append(tmp_dt) list_pos.append(tmp_pos) list_slp.append(tmp_slp) list_time.append(tmp_time) list_lat.append(tmp_lat) list_lon.append(tmp_lon) list_dh_ref.append(tmp_dh_ref) list_curv.append(tmp_curv) list_dh_tot.append(tmp_dh_tot) h = np.concatenate(list_h) dh = np.concatenate(list_dh) zsc = np.concatenate(list_zsc) dt_out = np.concatenate(list_dt) pos = np.concatenate(list_pos) slp = np.concatenate(list_slp) t = np.concatenate(list_time) lat = np.concatenate(list_lat) lon = np.concatenate(list_lon) dh_ref = np.concatenate(list_dh_ref) curv = np.concatenate(list_curv) dh_tot = np.concatenate(list_dh_tot) else: nproc=min(len(icebridge_argsin),nproc) print('Using 
'+str(nproc)+' processors...') pool = mp.Pool(nproc,maxtasksperchild=1) outputs = pool.map(icesat_comp_wrapper,icebridge_argsin) pool.close() pool.join() zip_out = list(zip(*outputs)) h = np.concatenate(zip_out[0]) dh = np.concatenate(zip_out[1]) zsc = np.concatenate(zip_out[2]) dt_out = np.concatenate(zip_out[3]) pos = np.concatenate(zip_out[4]) slp = np.concatenate(zip_out[5]) t = np.concatenate(zip_out[6]) lon = np.concatenate(zip_out[7]) lat = np.concatenate(zip_out[8]) dh_ref = np.concatenate(zip_out[9]) curv = np.concatenate(zip_out[10]) dh_tot = np.concatenate(zip_out[11]) return h, dh, zsc, dt_out, pos, slp, t, lon, lat, dh_ref, curv, dh_tot
def shift_icesat_stack(fn_ref, fn_icesat, fn_shp):

    _, _, shift_params, stats = dem_coregistration(fn_icesat, fn_ref, glaciermask=fn_shp, pts=True, inmem=True)

    return shift_params, stats


def combine_postproc_stacks_tvol(list_fn_stack, fn_shp, feat_id='RGIId', tlim=None, write_combined=True, outdir='.'):

    # get all rgiid intersecting stacks and the list of intersecting stacks
    all_rgiids, list_list_stacks = inters_feat_shp_stacks(fn_shp, list_fn_stack, feat_id)

    # sort by rgiid group with same intersecting stacks
    list_tuples = list(zip(all_rgiids, list_list_stacks))
    grouped = [(k, list(list(zip(*g))[0])) for k, g in groupby(list_tuples, itemgetter(1))]

    # loop through similar combination of stacks (that way, only have to combine/open them once)
    for i in range(len(grouped)):

        list_fn_stack_pack = grouped[i][0]
        rgiid_pack = grouped[i][1]

        list_ds = st.open_datasets(list_fn_stack_pack)
        if len(list_ds) > 1:
            ds = st.combine_stacks(list_ds)
        else:
            ds = list_ds[0]

        if write_combined:
            list_tile = [os.path.splitext(os.path.basename(fn))[0].split('_')[0] for fn in list_fn_stack_pack]
            out_nc = os.path.join(outdir, 'combined_stacks', '_'.join(list_tile))
            ds.to_netcdf(out_nc)

        df_tot = pd.DataFrame()
        # loop through rgiids
        for rgiid in rgiid_pack:
            ds_shp = gdal.OpenEx(fn_shp, gdal.OF_VECTOR)
            layer_name = os.path.splitext(os.path.basename(fn_shp))[0]
            geoimg = st.make_geoimg(ds, 0)
            mask = vt.geoimg_mask_on_feat_shp_ds(ds_shp, geoimg, layer_name=layer_name, feat_id=feat_id, feat_val=rgiid)
            dc, submask, _ = sel_dc(ds, tlim, mask)
            df = int_dc(dc, submask)
            df_tot = df_tot.append(df)
def get_dt_closest_valid(ds_filt, dates):
    """
    Derive data cube of time lag to closest valid observation based on the date vector of the time series
    and a boolean data cube of valid observation at original dates

    :param ds_filt: Boolean data cube of valid observations
    :param dates: Time vector of time series

    :returns: Data cube indexed to time vector with time lags to observation, Data cube with count of valid
        observation between time index, Data cube with yearly count of valid observation at the first yearly time index
    """
    # we read the boolean data cube indicating positions where original data was used
    # first, remove duplicate dates by merging boolean arrays for same dates
    t_vals = list(ds_filt.time.values)
    dates_rm_dupli = sorted(list(set(t_vals)))
    ind_firstdate = []
    for i, date in enumerate(dates_rm_dupli):
        ind_firstdate.append(t_vals.index(date))
    ds_filt2 = ds_filt.isel(time=np.array(ind_firstdate))
    for i in range(len(dates_rm_dupli)):
        t_ind = (t_vals == dates_rm_dupli[i])
        if len(t_ind) > 1:
            ds_filt2.z.values[i, :] = np.any(ds_filt.z[t_vals == dates_rm_dupli[i], :].values.astype(bool), axis=0)

    valid_obs = np.zeros((len(dates), ds_filt2.z.shape[1], ds_filt2.z.shape[2]))
    for i in range(len(dates) - 1):
        ind = np.logical_and(dates_rm_dupli >= dates[i], dates_rm_dupli < dates[i + 1])
        valid_obs[i, :, :] = np.sum(ds_filt2.z.values[ind, :], axis=0)

    # not robust, works only for monthly time steps (written in a hurry)
    valid_obs_peryear = np.zeros((len(dates), ds_filt2.z.shape[1], ds_filt2.z.shape[2]))
    for i in range(len(dates) - 1):
        if i % 12 == 0:
            ind = np.logical_and(dates_rm_dupli >= dates[i], dates_rm_dupli < dates[i + 12])
            valid_obs_peryear[i, :, :] = np.any(ds_filt2.z.values[ind, :], axis=0)

    # getting time data as days since 2000
    y0 = np.datetime64('2000-01-01')
    ftime = ds_filt2.time.values
    ftime_delta = np.array([t - y0 for t in ftime])
    days = [td.astype('timedelta64[D]').astype(int) for td in ftime_delta]

    # reindex to get closest not-NaN time value of the date vector
    filt_arr = np.array(ds_filt2.z.values, dtype='float32')
    filt_arr[filt_arr == 0] = np.nan
    days = np.array(days)
    filt_arr = filt_arr * days[:, None, None]
    at_least_2 = np.count_nonzero(~np.isnan(filt_arr), axis=0) >= 2
    filt_tmp = filt_arr[:, at_least_2]
    out_arr = np.copy(filt_tmp)
    for i in range(np.shape(filt_tmp)[1]):
        ind = ~np.isnan(filt_tmp[:, i])
        fn = interp1d(days[ind], filt_tmp[:, i][ind], kind='nearest', fill_value='extrapolate', assume_sorted=True)
        out_arr[:, i] = fn(days)
    filt_arr[:, at_least_2] = out_arr
    ds_filt2.z.values = filt_arr
    ds_filt_sub = ds_filt2.reindex(time=dates, method='nearest')

    for i in range(len(dates)):
        date = dates[i]
        day_diff = (date - y0).astype('timedelta64[D]').astype(int)
        ds_filt_sub.z.values[i, :] = np.abs(ds_filt_sub.z.values[i, :] - np.ones(ds_filt_sub.z.shape[1:3]) * day_diff)

    return ds_filt_sub, valid_obs, valid_obs_peryear
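# Usage sketch (hypothetical filenames): derive, for each time step of a fitted series, the time
# lag to the closest valid observation recorded in the corresponding '_filtered.nc' boolean cube.
def _example_dt_closest_valid(fn_stack='/data/stacks/N64W017_final.nc'):
    ds = xr.open_dataset(fn_stack)
    fn_filt = os.path.join(os.path.dirname(fn_stack), os.path.basename(fn_stack).split('_')[0] + '_filtered.nc')
    ds_filt = xr.open_dataset(fn_filt)
    ds_filt.load()
    ds_dt, valid_obs, valid_obs_py = get_dt_closest_valid(ds_filt, ds.time.values)
    # ds_dt.z holds absolute time lags in days; valid_obs counts observations per time step
    return ds_dt, valid_obs, valid_obs_py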
[docs]def sel_int_hypsocheat(argsin): """ Multiprocessing wrapper by shapefile feature for "hypsometric cheat" volume integration of elevation time series :param argsin: Tuple with: list of filename of netCDF stacks, shapefile parameters (see hypsocheat_postproc_stacks_tvol), time limits, processing index :returns: Three dataframes of volume change (see hypso_dc in volint_tools.py) """ # integrate volume without having to know the exact spatial disposition: works for hypsometric (needs only reference elevation) # a LOT faster as we don't need to combine (reproject + merge) stacks list_fn_stack_pack, shp_params, tlim, i = argsin print('Integrating volume for outline group: '+str(i+1)) list_ds = st.open_datasets(list_fn_stack_pack,load=True) dates = list_ds[0].time.values list_ds_filt = [] for fn_stack in list_fn_stack_pack: fn_filt = os.path.join(os.path.dirname(fn_stack), os.path.basename(fn_stack).split('_')[0] + '_filtered.nc') ds_filt = xr.open_dataset(fn_filt) ds_filt.load() list_ds_filt.append(ds_filt) fn_shp, feat_id, list_feat_val = shp_params ds_shp = gdal.OpenEx(fn_shp, gdal.OF_VECTOR) layer_name = os.path.splitext(os.path.basename(fn_shp))[0] layer = ds_shp.GetLayer() list_lat, list_lon, list_err_area, list_area = ([] for i in range(4)) for feat_val in list_feat_val: for feature in layer: if feat_val == feature.GetField(feat_id): geom = feature.GetGeometryRef() centroid_lon, centroid_lat, _ = geom.Centroid().GetPoint() list_lon.append(centroid_lon) list_lat.append(centroid_lat) #get the area ratio for a 50 meter buffer proj_in = osr.SpatialReference() proj_in.ImportFromEPSG(4326) perc_area_buff, area = vt.get_buffered_area_ratio(geom,proj_in.ExportToWkt(),15.) list_area.append(area) list_err_area.append(perc_area_buff) layer.ResetReading() df_tot, df_hyp_tot, df_int_tot = (pd.DataFrame() for i in range(3)) for feat_val in list_feat_val: dh, err, ref, dt, count_area, valid, valid_py = ([] for i in range(7)) print('Working on feature ID: '+feat_val) for i, ds in enumerate(list_ds): ds_filt = list_ds_filt[i] #get raster equivalent of stack geoimg = st.make_geoimg(ds, 0) #get mask of latlon tiling tile_name = st.tilename_stack(ds) mask_tile = vt.latlontile_nodatamask(geoimg,tile_name) mask_feat = vt.geoimg_mask_on_feat_shp_ds(ds_shp, geoimg, layer_name=layer_name, feat_id=feat_id, feat_val=feat_val) mask = np.logical_and(mask_tile,mask_feat) if np.count_nonzero(mask) >0: dc, submask, _ = sel_dc(ds,tlim,mask) dc_dt, _, _ = sel_dc(ds_filt,tlim,mask) dc_dt_sub, valid_obs, valid_obs_peryear = get_dt_closest_valid(dc_dt,dates) tmp_dh = dc.z.values[:,submask] - dc.z.values[0,submask] tmp_err = np.sqrt(dc.z_ci.values[:,submask]**2 + dc.z_ci.values[0,submask]**2) tmp_ref = dc.z.values[0,submask] tmp_dt = dc_dt_sub.z.values[:,submask] tmp_valid = valid_obs[:, submask] tmp_valid_peryear = valid_obs_peryear[:,submask] count = np.count_nonzero(mask) #exception! to correct in fit_tools at some point? 
#when there is a same-date DEM overlapping twice, and those are the only 2 observations over the whole period, the dh is wrongly of "0" and the dt is only NaNs #temporary fix: at_least_2 = np.count_nonzero(~np.isnan(tmp_dt), axis=1) >=2 tmp_dh[~at_least_2, :] = np.nan tmp_err[~at_least_2, :] = np.nan dh.append(tmp_dh) err.append(tmp_err) ref.append(tmp_ref) dt.append(tmp_dt) count_area.append(count) valid.append(tmp_valid) valid_py.append(tmp_valid_peryear) else: continue ds = list_ds[0] x = ds.x.shape[0] dx = np.round((ds.x.max().values - ds.x.min().values) / float(x)) if len(dh)>0 and np.count_nonzero(~np.isnan(np.concatenate(dh,axis=1)))>0: dh = np.concatenate(dh,axis=1) err = np.concatenate(err,axis=1) ref = np.concatenate(ref) dt = np.concatenate(dt,axis=1) valid = np.concatenate(valid,axis=1) valid_py = np.concatenate(valid_py,axis=1) df, df_hyp, df_int = volt.hypso_dc(dh,err,ref,dt,dates,np.ones(np.shape(ref),dtype=bool),dx) df_int['valid_obs'] = np.nanmean(valid,axis=1) df_int['valid_obs_py'] = np.nanmean(valid_py,axis=1) else: area = np.sum(np.array(count_area))*dx**2 df = pd.DataFrame() df = df.assign(hypso=[np.nan], time=[np.nan], dh=[np.nan], err_dh=[np.nan]) df_hyp = pd.DataFrame() df_hyp = df_hyp.assign(hypso=[np.nan], area_meas=[np.nan], area_tot=[area], nmad=[np.nan]) df_int = pd.DataFrame() df_int = df_int.assign(time=dates, dh=[np.nan]*len(dates), err_dh=[np.nan]*len(dates), perc_area_meas=[np.nan]*len(dates), valid_obs=[np.nan]*len(dates),valid_obs_py=[np.nan]*len(dates)) df['rgiid'] = feat_val df_hyp['rgiid'] = feat_val df_int['rgiid'] = feat_val df_int['area'] = list_area[list_feat_val.index(feat_val)] df_int['lon'] = list_lon[list_feat_val.index(feat_val)] df_int['lat'] = list_lat[list_feat_val.index(feat_val)] df_int['perc_err_cont'] = list_err_area[list_feat_val.index(feat_val)] df_tot = df_tot.append(df) df_hyp_tot = df_hyp_tot.append(df_hyp) df_int_tot = df_int_tot.append(df_int) return df_tot, df_hyp_tot, df_int_tot
[docs]def hypsocheat_postproc_stacks_tvol(list_fn_stack, fn_shp, feat_id='RGIId', tlim=None,nproc=1, outfile='int_dh.csv'): """ Hypsometric cheat volume integration of elevation time series The objective of the "cheat" is to highly decrease processing time by allowing to combine parts of shapefile features (e.g., glaciers) which are projected in different georeferencing systems in the stacks (projected in UTM), without having to merge the stacks into the same projection Because integration is hypsometric (based on reference elevation of the glacier), we do not need the information of spatial structure of the glacier :param list_fn_stack: List of filenames of netCDF elevation time series :param fn_shp: Filename of shapefile :param feat_id: Feature name of interest :param tlim: Time limits for integration :param nproc: Number of cores used for multiprocessing [1] :param outfile: Filename of output .csv :returns: """ # get all rgiid intersecting stacks and the list of intersecting stacks start = time.time() all_rgiids, list_list_stacks = inters_feat_shp_stacks(fn_shp, list_fn_stack, feat_id) # sort by rgiid group with same intersecting stacks (avoid loading + combining several time similar groups of stacks) # !!! commenting to speed up things CPU wise for 100 m stacks, but need to uncomment this for higher resolution !!! # all_rgiids = [rgiid for _, rgiid in sorted(zip(list_list_stacks,all_rgiids))] # list_list_stacks = sorted(list_list_stacks) list_tuples = list(zip(all_rgiids, list_list_stacks)) grouped = [(k, list(list(zip(*g))[0])) for k, g in groupby(list_tuples, itemgetter(1))] print('Found '+str(len(all_rgiids))+' outlines intersecting stacks') print('Grouped outlines in '+str(len(grouped))+' packs') print('Elapsed: '+str(time.time()-start)) # loop through similar combination of stacks (that way, only have to combine/open them once) df_final, df_hyp_final, df_int_final = (pd.DataFrame() for i in range(3)) if nproc == 1: for i in range(len(grouped)): list_fn_stack_pack = grouped[i][0] rgiid_pack = grouped[i][1] shp_params = (fn_shp,feat_id,rgiid_pack) df, df_hyp, df_int = sel_int_hypsocheat((list_fn_stack_pack,shp_params,tlim,i)) df_final = df_final.append(df) df_hyp_final = df_hyp_final.append(df_hyp) df_int_final = df_int_final.append(df_int) else: argsin = [(grouped[i][0], (fn_shp, feat_id, grouped[i][1]), tlim, i) for i in range(len(grouped))] pool = mp.Pool(nproc,maxtasksperchild=1) outputs = pool.map(sel_int_hypsocheat,argsin,chunksize=1) pool.close() pool.join() print('Finished processing, elapsed: ' + str(time.time() - start)) zips = list(zip(*outputs)) dfs = zips[0] dfs_hyp = zips[1] dfs_int = zips[2] print('Finished zipping, elapsed: ' + str(time.time() - start)) df_final = pd.concat(dfs) df_hyp_final = pd.concat(dfs_hyp) df_int_final = pd.concat(dfs_int) print('Finished putting in dataframes, elapsed: ' + str(time.time() - start)) fn_csv = os.path.join(os.path.dirname(outfile),os.path.splitext(os.path.basename(outfile))[0]+'_hyp2.csv') fn_hyp_csv = os.path.join(os.path.dirname(outfile),os.path.splitext(os.path.basename(outfile))[0]+'_hyp.csv') fn_int_csv = os.path.join(os.path.dirname(outfile),os.path.splitext(os.path.basename(outfile))[0]+'_int.csv') df_final.to_csv(fn_csv,index=False) df_hyp_final.to_csv(fn_hyp_csv,index=False) df_int_final.to_csv(fn_int_csv,index=False)
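# Usage sketch (hypothetical paths): integrate glacier volume change for a whole RGI region with
# the "hypsometric cheat"; writes three .csv files suffixed _hyp2, _hyp and _int next to 'outfile'.
def _example_hypsocheat_tvol(dir_stacks='/data/stacks', fn_shp='/data/rgi/rgi60_region06.shp',
                             outfile='/data/volumes/dh_06_rgi60_int.csv'):
    list_fn_stack = glob(os.path.join(dir_stacks, '**/*_final.nc'), recursive=True)
    hypsocheat_postproc_stacks_tvol(list_fn_stack, fn_shp, feat_id='RGIId',
                                    tlim=(np.datetime64('2000-01-01'), np.datetime64('2020-01-01')),
                                    nproc=8, outfile=outfile)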
[docs]def get_base_df_inventory(dir_shp,outfile): """ Create a "base" dataframe which contains characteristics of RGIIds features (this is done separately to avoid duplicating that info in GBs of data into elevation time series, as those are 2D DataFrames) :param dir_shp: Directory containing RGI shapefiles :param outfile: Output .csv file containing dataframe """ list_fn_shp = glob(os.path.join(dir_shp,'*/*.shp'),recursive=True) df_world = pd.DataFrame() for fn_shp in list_fn_shp: print('Working on region:'+fn_shp) ds_shp = gdal.OpenEx(fn_shp, gdal.OF_VECTOR) layer = ds_shp.GetLayer() list_rgiid, list_cl, list_nom, list_reg, list_term, list_area, list_lon, list_lat = ([] for i in range(8)) for feature in layer: rgiid = feature.GetField('RGIId') geom = feature.GetGeometryRef() centroid_lon, centroid_lat, _ = geom.Centroid().GetPoint() proj_in = osr.SpatialReference() proj_in.ImportFromEPSG(4326) _, area = vt.get_buffered_area_ratio(geom, proj_in.ExportToWkt(), 15.) print('Glacier: '+rgiid) if rgiid[0] != 'G': reg = int(rgiid[6:8]) cl = feature.GetField('Connect') nom = feature.GetField('Status') term = feature.GetField('TermType') else: reg = 12 cl = 0 nom = 0 term = 0 list_rgiid.append(rgiid) list_cl.append(cl) list_nom.append(nom) list_reg.append(reg) list_term.append(term) list_area.append(area) list_lon.append(centroid_lon) list_lat.append(centroid_lat) df_reg = pd.DataFrame() df_reg = df_reg.assign(rgiid=list_rgiid,cl=list_cl,nom=list_nom,reg=list_reg,term=list_term,area=list_area,lon=list_lon,lat=list_lat) df_world = df_world.append(df_reg) df_world.to_csv(outfile)
[docs]def add_base_to_int(df,fn_base,reg): """ Use "base" RGI file to check whether all glaciers are covered before aggregation, and to remove nominal glaciers and CL2 glacier estimates :param df: DataFrame of integrated volume time series :param fn_base: Filename of "base" RGI file :param reg: RGI region number: 20 corresponds to Alaska and WNA combined, 21 corresponds to HMA combined (necessary for some uncertainty calculations as those are contiguous) :returns: DataFrame of integrated volume time series with nominal, CL2 glaciers removed and not-covered glaciers added with nodata """ # if we can, help ourselves with pre-compiled data for all glaciers df_base = pd.read_csv(fn_base) # keep only glaciers from the region if reg == 20: ind = np.logical_or(df_base.reg == 1, df_base.reg == 2) elif reg == 21: ind = np.logical_or.reduce((df_base.reg == 13, df_base.reg == 14, df_base.reg == 15)) else: ind = df_base.reg == reg df_base_reg = df_base[ind] # add glaciers completely omitted in this region (no data at all in the tile thus no stack: only concerns 2 tiles in Antarctica?) proc_rgiid = list(set(list(df.rgiid))) omit_rgiid = [rgiid for rgiid in list(df_base_reg.rgiid) if rgiid not in proc_rgiid] print('Added ' + str(len(omit_rgiid)) + ' omitted glaciers.') dates = list(set(list(df.time))) for rgiid in omit_rgiid: df_tmp = pd.DataFrame() df_tmp = df_tmp.assign(area=[df_base_reg[df_base_reg.rgiid == rgiid].area.values[0]] * len(dates)) df_tmp['time'] = dates df_tmp['rgiid'] = rgiid df_tmp['dh'] = np.nan df = df.append(df_tmp) # remove nominal glaciers (irrelevant outlines: only North Asia, Scandinavia and Alps) ind_nom = df_base_reg.nom == 2 nom_rgiid = list(df_base_reg.rgiid[ind_nom]) for rgiid in nom_rgiid: df.loc[df.rgiid == rgiid,'dh'] = np.nan print('Removed elevation change data for ' + str(len(nom_rgiid)) + ' nominal glaciers.') # remove CL2 glaciers (Greenland) ind_cl2 = df_base_reg.cl == 2 cl2_rgiid = list(df_base_reg.rgiid[ind_cl2]) for rgiid in cl2_rgiid: df.loc[df.rgiid == rgiid,'area'] = np.nan print('Removed contributing area for ' + str(len(cl2_rgiid)) + ' CL2 glaciers.') return df
[docs]def aggregate_int_to_all(df,nproc=1,get_corr_err=True): """ Aggregate spatially per-glaciers volume changes time series and propagate errors :param df: DataFrame of integrated volume time series :param nproc: Number of cores used for multiprocessing [1] :param get_corr_err: Derive correlated error (long): currently forced to annual only, because seasonal biases are not accounted for and can be complex :returns: DataFrame of aggregated volume time series """ # get area of glacier without any data to account for it later valid = np.logical_and.reduce((~np.isnan(df.dh), ~np.isnan(df.area),~np.isnan(df.err_dh))) df_nodata = df[~valid] rgiid_nodata = list(set(list(df_nodata.rgiid))) area_nodata = 0 for rgiid in rgiid_nodata: area_missed = df_nodata[df_nodata.rgiid == rgiid].area.values[0] if ~np.isnan(area_missed): area_nodata += area_missed # keep only valid glaciers print('Removed '+str(len(rgiid_nodata))+' non-valid glaciers with total area of '+str(area_nodata/1000000.)+' km².') print(rgiid_nodata) reg = df.reg.values[0] df = df[valid] # get the regional volume df['dvol'] = df['dh'] * df['area'] df['var_cont'] = df['perc_err_cont'] * df['area'] df['area_res'] = df['area'] * df['perc_area_res'] df['area_meas'] = df['area'] * df['perc_area_meas'] df['area_valid_obs'] = df['area'] * df['valid_obs'] df['area_valid_obs_py'] = df['area'] * df['valid_obs_py'] df_tot = df.groupby('time')['dvol', 'area', 'var_cont', 'area_meas', 'area_res', 'area_valid_obs','area_valid_obs_py'].sum() # propagate volume change accouting for area of no data glaciers df_tot['dvol'] *= (df_tot['area'] + area_nodata) / df_tot['area'] df_tot['perc_err_cont'] = df_tot['var_cont'] / df_tot['area'] df_tot['valid_obs'] = df_tot['area_valid_obs'] / df_tot['area'] df_tot['valid_obs_py'] = df_tot['area_valid_obs_py'] / df_tot['area'] df_tot['area'] += area_nodata df_tot['perc_area_meas'] = df_tot['area_meas'] / df_tot['area'] df_tot['perc_area_res'] = df_tot['area_res'] / df_tot['area'] df_tot['dh'] = df_tot['dvol'] / df_tot['area'] df_tot['area_nodata'] = area_nodata df_tot['reg'] = reg df_tot = df_tot.drop(columns=['var_cont', 'area_meas', 'area_res', 'area_valid_obs']) # integrate elevation change error accouting for spatial correlation of temporal interpolation between glaciers if np.count_nonzero(valid) == 0: return df_tot # get error only for annual tlim = [np.datetime64(str(2000 + i) + '-01-01') for i in range(21)] times = sorted(list(set(list(df['time'])))) df_time = pd.DataFrame() df_time = df_time.assign(time=times) df_time.index = pd.DatetimeIndex(pd.to_datetime(times)) closest_dates = [] for tl in tlim: time_clos = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tl), method='nearest')][0] closest_dates.append(time_clos) # int_err = [] df_tot['err_dh'] = np.nan if get_corr_err: for time in closest_dates: print('Time step: ' + time) df_tmp = df[df.time == time] corr_ranges = [150, 2000, 5000, 20000, 50000, 200000] list_tuple_errs = list( zip(*[df_tmp['err_corr_' + str(corr_ranges[i])].values for i in range(len(corr_ranges))])) list_area_tot = df_tmp.area.values list_lat = df_tmp.lat.values list_lon = df_tmp.lon.values err = spt.double_sum_covar(list_tuple_errs, corr_ranges, list_area_tot, list_lat, list_lon, nproc=nproc) df_tot.loc[df_tot.index == time,'err_dh'] = err # int_err = np.array(int_err) # df_tot['err_dh'] = int_err # error on general inventory, uncharted glaciers ; 3 percent here # df_tot['err_area'] = 3./100.*df_tot['area'] # propagate error to volume change # df_tot['err_dvol'] = 
np.sqrt((df_tot['err_dh']*df_tot['area'])**2 + (df_tot['dh']*np.sqrt(df_tot['err_area']**2+df_tot['err_cont']**2))**2) df_tot['err_dvol'] = np.sqrt((df_tot['err_dh'] * df_tot['area']) ** 2 + ( df_tot['dh'] * df_tot['perc_err_cont'] / 100. * df_tot['area']) ** 2) # convert to mass change (Huss, 2013) df_tot['dm'] = df_tot['dvol'] * 0.85 / 10 ** 9 # propagate error to mass change (Huss, 2013) df_tot['err_dm'] = np.sqrt((df_tot['err_dvol'] * 0.85 / 10 ** 9) ** 2 + ( df_tot['dvol'] * 0.06 / 10 ** 9) ** 2) return df_tot
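# Worked example of the volume-to-mass conversion used above (Huss, 2013): a density conversion
# factor of 0.85 (i.e. 850 kg m-3) with a 0.06 (60 kg m-3) uncertainty; inputs are illustrative.
def _example_mass_conversion(dvol=-2.0e9, err_dvol=0.3e9):
    # dvol and err_dvol in m3; outputs in Gt
    dm = dvol * 0.85 / 10 ** 9                            # -1.7 Gt
    err_dm = np.sqrt((err_dvol * 0.85 / 10 ** 9) ** 2
                     + (dvol * 0.06 / 10 ** 9) ** 2)      # ~0.28 Gt
    return dm, err_dm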
def df_int_to_base(infile, fn_base=None):
    """
    Wrapper for adding base RGI info to per-glacier DataFrames, for each region separately, and writing the result to .csv

    :param infile: DataFrame of integrated volume time series
    :param fn_base: Filename of "base" RGI file

    :returns:
    """
    df_init = pd.read_csv(infile)

    print('Working on ' + infile)

    region = int(os.path.basename(infile).split('_')[1])

    # to integrate properly regions 1, 2, 13, 14, 15
    if region == 1:
        list_reg = [20, 1, 2]
    elif region == 13:
        list_reg = [21, 13, 14, 15]
    else:
        list_reg = [region]

    for reg in list_reg:
        print('Processing for region ' + str(reg))
        if reg > 19:
            fn_base_out = os.path.join(os.path.dirname(infile), os.path.splitext(os.path.basename(infile))[0] + '_base.csv')
            df = df_init
        else:
            fn_base_out = os.path.join(os.path.dirname(infile), 'dh_' + str(reg).zfill(2) + '_rgi60_int_base.csv')
            if reg in [1, 2, 13, 14, 15]:
                ind = np.array(['RGI60-' + str(reg).zfill(2) in rgiid for rgiid in list(df_init.rgiid.values)])
                df = df_init[ind]
            else:
                df = df_init

        df['reg'] = reg
        if fn_base is not None:
            df = add_base_to_int(df, fn_base, reg)

        # if not os.path.exists(fn_base_out):
        df.to_csv(fn_base_out)
def df_int_to_reg(infile, nproc=1):
    """
    Wrapper for integrating volume change time series per region (based on RGI number) and writing to .csv

    :param infile: DataFrame of integrated volume time series
    :param nproc: Number of cores used for multiprocessing [1]

    :returns:
    """
    df = pd.read_csv(infile)

    fn_reg_out = os.path.join(os.path.dirname(infile), os.path.splitext(os.path.basename(infile))[0] + '_reg.csv')

    # aggregate per-glacier series into the regional total
    df_tot = aggregate_int_to_all(df, nproc=nproc)
    df_tot.to_csv(fn_reg_out)
[docs]def aggregate_all_to_period(df,list_tlim=None,mult_ann=None,fn_tarea=None,frac_area=None,list_tag=None): """ Temporal integration of volume change time series into rates for a specific period and error propagation For regions, we can account for time-varying areas by approximated per-region annual areas (Zemp et al. (2019)) :param df: DataFrame of volume time series :param list_tlim: List of tuples or early and late date to integrate over :param mult_ann: Ignore tuples, integrate over all successive multi-annual periods of this length (e.g., four successive 5-year rates) :param fn_tarea: Filename of DataFrame with temporally-varying areas :param frac_area: Use only a corresponding fraction of the regional area to estimate time-varying areas (e.g., subset of Greenland, or other) :param list_tag: Tag a naming to each integration period :returns: DataFrame of volume change rates for all successive periods fitting the year length """ if np.count_nonzero(~np.isnan(df.dh))==0: df_mult_ann = pd.DataFrame() return df_mult_ann if list_tlim is None and mult_ann is None: print('Exiting: need to provide either a time interval (tlim) or a year length (mult_ann)') if mult_ann is not None: print('Multi-annual: ignoring list of time periods') nb_period = int(np.floor(20 / mult_ann)) list_tlim = [(np.datetime64(str(2000 + mult_ann * i) + '-01-01'),np.datetime64(str(2000 + mult_ann * (i+1)) + '-01-01')) for i in range(nb_period)] list_closest_dates = [] times = sorted(list(set(list(df['time'])))) df_time = pd.DataFrame() df_time = df_time.assign(time=times) df_time.index = pd.DatetimeIndex(pd.to_datetime(times)) for tl in list_tlim: early_time_clos = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tl[0]), method='nearest')][0] late_time_clos = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tl[1]), method='nearest')][0] list_closest_dates.append((early_time_clos,late_time_clos)) reg = df.reg.values[0] if fn_tarea is not None: df_tarea = pd.read_csv(fn_tarea) # get regionally evolving areas if reg == 20: tmp_tarea = df_tarea['RGI1'].values + df_tarea['RGI2'].values elif reg == 21: tmp_tarea = df_tarea['RGI13'].values + df_tarea['RGI14'].values + df_tarea['RGI15'].values else: tmp_tarea = df_tarea['RGI' + str(int(reg))].values if frac_area is not None: tmp_tarea = frac_area * tmp_tarea #list corresponding to tuples of early and late date (to later derive mean area of two periods) list_tarea = [] for i in range(len(list_tlim)): # getting years 2000 to 2020 ind = df_tarea['YEAR'] == list_tlim[i][0].astype(object).year early_tarea = tmp_tarea[ind][0] * 1000000 ind = df_tarea['YEAR'] == list_tlim[i][1].astype(object).year late_tarea = tmp_tarea[ind][0] * 1000000 list_tarea.append((early_tarea,late_tarea)) else: list_tarea =[(df.area.values[0],df.area.values[0]) for i in range(len(list_tlim))] list_midtarea, list_dhdt, list_err_dhdt, list_dvoldt, list_err_dvoldt, list_dmdt, list_err_dmdt, list_valid_obs, list_dt, list_valid_obs_py, list_dmdtda, list_err_dmdtda = ( [] for i in range(12)) for i in range(len(list_tlim)): # derive volume change for subperiod tlim = list_tlim[i] mult_ann = tlim[1].astype(object).year - tlim[0].astype(object).year area = df.area.values[0] dvol = (df[df.time == list_closest_dates[i][1]].dvol.values - df[df.time == list_closest_dates[i][0]].dvol.values)[0] dh = dvol / area err_dh = np.sqrt( df[df.time == list_closest_dates[i][1]].err_dh.values[0] ** 2 + df[df.time == list_closest_dates[i][0]].err_dh.values[ 0] ** 2) err_dvol = np.sqrt((err_dh * area) ** 2 + (dh * 
df.perc_err_cont.values[0] / 100. * area) ** 2) dvoldt = dvol / mult_ann err_dvoldt = err_dvol / mult_ann dmdt = dvol * 0.85 / 10 ** 9 / mult_ann err_dmdt = np.sqrt((err_dvol * 0.85 / 10 ** 9) ** 2 + ( dvol * 0.06 / 10 ** 9) ** 2) / mult_ann linear_area = (list_tarea[i][0] + list_tarea[i][1]) / 2 dhdt = dvol / linear_area / mult_ann perc_err_linear_area = 1. / 100 err_dhdt = np.sqrt((err_dvol / linear_area) ** 2 \ + (perc_err_linear_area * linear_area * dvol / linear_area ** 2) ** 2) / mult_ann dmdtda = dmdt / linear_area * 10**9 err_dmdtda = np.sqrt((err_dmdt * 10**9 / linear_area) ** 2 \ + (perc_err_linear_area * linear_area * dmdt * 10**9/ linear_area ** 2) ** 2) ind = np.logical_and(df.time >= list_closest_dates[i][0], df.time < list_closest_dates[i][1]) valid_obs = np.nansum(df.valid_obs[ind].values) valid_obs_py = np.nansum(df.valid_obs_py[ind].values) list_midtarea.append(linear_area) list_dhdt.append(dhdt) list_err_dhdt.append(err_dhdt) list_dvoldt.append(dvoldt) list_err_dvoldt.append(err_dvoldt) list_dmdt.append(dmdt) list_err_dmdt.append(err_dmdt) list_dmdtda.append(dmdtda) list_err_dmdtda.append(err_dmdtda) list_dt.append(str(tlim[0]) + '_' + str(tlim[1])) list_valid_obs.append(valid_obs) list_valid_obs_py.append(valid_obs_py) df_mult_ann = pd.DataFrame() df_mult_ann = df_mult_ann.assign(period=list_dt, tarea=list_midtarea, dhdt=list_dhdt, err_dhdt=list_err_dhdt, dvoldt=list_dvoldt, err_dvoldt=list_err_dvoldt, dmdt=list_dmdt, err_dmdt=list_err_dmdt, dmdtda=list_dmdtda,err_dmdtda=list_err_dmdtda,valid_obs=list_valid_obs, valid_obs_py=list_valid_obs_py) if list_tag is not None: df_mult_ann['tag'] = list_tag df_mult_ann['reg'] = reg df_mult_ann['perc_area_meas'] = df.perc_area_meas.values[0] df_mult_ann['perc_area_res'] = df.perc_area_res.values[0] df_mult_ann['area_nodata'] = df.area_nodata.values[0] df_mult_ann['area'] = df.area.values[0] return df_mult_ann
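# Usage sketch (hypothetical filenames): derive 5-year rates from a regional volume change series,
# optionally with time-varying areas read from a per-region area file (following Zemp et al. (2019)).
def _example_rates_5yr(fn_reg='/data/volumes/dh_06_rgi60_int_base_reg.csv',
                       fn_tarea='/data/areas/time_varying_areas.csv'):
    df = pd.read_csv(fn_reg)
    df_rates = aggregate_all_to_period(df, mult_ann=5, fn_tarea=fn_tarea)
    return df_rates  # one row per 5-year period: dhdt, dvoldt, dmdt, dmdtda and their errors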
def aggregate_indep_regions(df_p):
    """
    Aggregate regions with independent assumptions for uncertainty propagation

    :param df_p: DataFrame of per-region rates over a common period

    :returns: DataFrame of globally aggregated rates
    """
    # GLOBAL TOTAL
    area_global = np.nansum(df_p.area.values)
    area_nodata_global = np.nansum(df_p.area_nodata.values)
    tarea_global = np.nansum(df_p.tarea.values)

    dmdt_global = np.nansum(df_p.dmdt.values)
    err_dmdt_global = np.sqrt(np.nansum(df_p.err_dmdt.values ** 2))
    dvoldt_global = np.nansum(df_p.dvoldt.values)
    err_dvoldt_global = np.sqrt(np.nansum(df_p.err_dvoldt.values ** 2))

    err_tarea_global = np.sqrt(np.nansum((1 / 100. * df_p.tarea.values) ** 2))
    dhdt_global = np.nansum(df_p.dvoldt.values) / tarea_global
    err_dhdt_global = np.sqrt((err_dvoldt_global / tarea_global) ** 2
                              + (err_tarea_global * dvoldt_global / (tarea_global ** 2)) ** 2)
    dmdtda_global = dmdt_global * 10 ** 9 / tarea_global
    err_dmdtda_global = np.sqrt((err_dmdt_global * 10 ** 9 / tarea_global) ** 2
                                + (err_tarea_global * dmdt_global * 10 ** 9 / (tarea_global ** 2)) ** 2)

    perc_area_res_global = np.nansum(df_p.perc_area_res.values * df_p.area.values) / np.nansum(df_p.perc_area_res.values)
    perc_area_meas_global = np.nansum(df_p.perc_area_meas.values * df_p.area.values) / np.nansum(df_p.perc_area_meas.values)
    valid_obs_global = np.nansum(df_p.valid_obs.values * df_p.area.values) / np.nansum(df_p.perc_area_res.values)
    valid_obs_py_global = np.nansum(df_p.valid_obs_py.values * df_p.area.values) / np.nansum(df_p.perc_area_res.values)

    df_sum = pd.DataFrame()
    df_sum = df_sum.assign(area=[area_global], area_nodata=[area_nodata_global], dhdt=[dhdt_global],
                           err_dhdt=[err_dhdt_global], dvoldt=[dvoldt_global], err_dvoldt=[err_dvoldt_global],
                           dmdt=[dmdt_global], err_dmdt=[err_dmdt_global], dmdtda=[dmdtda_global],
                           err_dmdtda=[err_dmdtda_global], tarea=[tarea_global], perc_area_res=[perc_area_res_global],
                           perc_area_meas=[perc_area_meas_global], valid_obs=[valid_obs_global],
                           valid_obs_py=[valid_obs_py_global])

    return df_sum
def df_region_to_multann(infile_reg, fn_tarea=None, frac_area=None):
    """
    Wrapper for temporal integration of regional volume change time series for all possible 1-, 2-, 4-, 5-, 10-
    and 20-year periods and writing to .csv

    :param infile_reg: DataFrame of regional, integrated volume time series
    :param fn_tarea: Filename of DataFrame with temporally-varying areas
    :param frac_area: Use only a corresponding fraction of the regional area to estimate time-varying areas
        (e.g., subset of Greenland, or other)

    :returns:
    """
    df = pd.read_csv(infile_reg)

    fn_out = os.path.join(os.path.dirname(infile_reg), os.path.splitext(os.path.basename(infile_reg))[0] + '_subperiods.csv')

    list_df = []
    for mult_ann in [1, 2, 4, 5, 10, 20]:
        df_mult_ann = aggregate_all_to_period(df, mult_ann=mult_ann, fn_tarea=fn_tarea, frac_area=frac_area)
        list_df.append(df_mult_ann)

    df_final = pd.concat(list_df)
    df_final.to_csv(fn_out)
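# End-to-end sketch of the post-processing chain defined by the wrappers above (all filenames are
# placeholders): per-glacier series -> base RGI info -> regional series -> multi-annual rates.
def _example_postproc_chain(fn_int='/data/volumes/dh_06_rgi60_int.csv',
                            fn_base='/data/rgi/base_rgi.csv',
                            fn_tarea='/data/areas/time_varying_areas.csv'):
    df_int_to_base(fn_int, fn_base=fn_base)      # writes dh_06_rgi60_int_base.csv
    fn_int_base = os.path.join(os.path.dirname(fn_int), 'dh_06_rgi60_int_base.csv')
    df_int_to_reg(fn_int_base, nproc=4)          # writes dh_06_rgi60_int_base_reg.csv
    fn_reg = os.path.join(os.path.dirname(fn_int), 'dh_06_rgi60_int_base_reg.csv')
    df_region_to_multann(fn_reg, fn_tarea=fn_tarea)  # writes dh_06_rgi60_int_base_reg_subperiods.csv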
def wrapper_tile_rgiids(argsin):

    list_tile, df_base, uniq_rgiids, i, itot = argsin

    print('Working on pack: ' + str(i) + ' out of ' + str(itot))

    tmp_rgiids = []
    for j, tile in enumerate(list_tile):
        print('Tile: ' + str(j) + ' out of ' + str(len(list_tile)))
        ind_base = np.logical_and.reduce((df_base.lat >= tile[1], df_base.lat < tile[3],
                                          df_base.lon >= tile[0], df_base.lon < tile[2]))
        rgiids_base = list(df_base[ind_base].rgiid)
        if len(rgiids_base) == 0:
            tmp_rgiids.append(['NA'])
            print('No base glacier in this tile, skipping...')
            continue
        # this is only necessary if the few void glaciers have not been added to the dataframe after volume integration
        rgiids = [rgiid for rgiid in rgiids_base if rgiid in uniq_rgiids]
        if len(rgiids) == 0:
            tmp_rgiids.append(['NA'])
            print('No glacier in this tile, skipping...')
            continue
        tmp_rgiids.append(rgiids)

    return tmp_rgiids


def wrapper_tile_int_to_all_to_mult_ann(argsin):

    df_tile, tile, i, imax = argsin
    # df_tile = df_keep[df_keep.rgiid.isin(rgiids_final)]

    print('Working on tile ' + str(tile) + ': ' + str(i) + ' out of ' + str(imax))

    df_agg = aggregate_int_to_all(df_tile, nproc=1)
    df_agg['time'] = df_agg.index.values

    list_df_mult = []
    for mult_ann in [1, 2, 4, 5, 10, 20]:
        # TODO: for specific rates at scales smaller than the region, we do not assume time-varying areas yet...
        #  somewhat inconsistent
        df_mult = aggregate_all_to_period(df_agg, mult_ann=mult_ann)
        list_df_mult.append(df_mult)
    df_mult_all = pd.concat(list_df_mult)

    df_mult_all['tile_lonmin'] = tile[0]
    df_mult_all['tile_latmin'] = tile[1]

    return df_mult_all
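# Illustrative sketch (not part of the original module): the two wrappers above take a single "argsin" tuple so
# that they can be mapped directly with multiprocessing.Pool. The packing below mirrors how df_all_base_to_tile
# builds those tuples; the inputs are assumed to be a list of tile extents, a base RGI DataFrame and a list of RGIIDs.
def _example_pack_args_for_pool(list_tile_possib, df_base, uniq_rgiids, nproc=4):

    # split the tiles into one pack per core
    pack_size = int(np.ceil(len(list_tile_possib) / nproc))
    argsin = [(list_tile_possib[i:min(i + pack_size, len(list_tile_possib))], df_base, uniq_rgiids, k, nproc)
              for k, i in enumerate(np.arange(0, len(list_tile_possib), pack_size))]

    pool = mp.Pool(nproc, maxtasksperchild=1)
    outputs = pool.map(wrapper_tile_rgiids, argsin, chunksize=1)
    pool.close()
    pool.join()

    # flatten the per-pack outputs back into one list of per-tile RGIID lists
    return list(itertools.chain(*outputs))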
[docs]def df_all_base_to_tile(list_fn_int_base, fn_base, tile_size=1, nproc=1, sort_tw=True):
    """
    Integrate all per-glacier volume change time series on a global tiling of a given size, for all possible 1-, 2-,
    4-, 5-, 10- and 20-year periods, with error propagation

    :param list_fn_int_base: List of filenames of DataFrames of per-glacier volume change time series with base RGIID info added
    :param fn_base: Filename of "base" RGI file
    :param tile_size: Size of tiling in lat/lon degrees
    :param nproc: Number of cores used for multiprocessing [1]
    :param sort_tw: Sort between marine-terminating and land-terminating glaciers

    :returns:
    """
    fn_out = os.path.join(os.path.dirname(list_fn_int_base[0]), 'dh_world_tiles_' + str(tile_size) + 'deg.csv')

    def world_latlon_tiles(deg_size):
        list_tile = []
        for ymin in np.arange(-90, 90, deg_size):
            for xmin in np.arange(-180, 180, deg_size):
                list_tile.append(np.array([xmin, ymin, xmin + deg_size, ymin + deg_size], dtype=float))
        return list_tile

    list_tile = world_latlon_tiles(tile_size)
    # list_tile = [np.array([-180,70,-179,71],dtype=float),np.array([-18,64,-17,65],dtype=float),np.array([-17,64,-16,65],dtype=float)]

    df_base = pd.read_csv(fn_base)

    if sort_tw:
        ind_tw = df_base.term == 1
        keep_tw = list(df_base.rgiid[ind_tw])
        keep_ntw = list(df_base.rgiid[~ind_tw])
        list_keeps = [keep_tw, keep_ntw, list(df_base.rgiid)]
        name_keeps = ['tw', 'ntw', 'all']
    else:
        list_keeps = [list(df_base.rgiid)]
        name_keeps = ['all']

    list_df_final = []
    # list_dfs = []
    for fn_int_base in list_fn_int_base:
        df = pd.read_csv(fn_int_base)
        # list_dfs.append(pd.read_csv(fn_int_base))
        # df = pd.concat(list_dfs)

        for keep in list_keeps:

            print('Processing for category: ' + name_keeps[list_keeps.index(keep)])

            df_keep = df[df.rgiid.isin(keep)]
            if len(df_keep) == 0:
                continue

            r = int(os.path.basename(fn_int_base).split('_')[1])
            if r == 1:
                list_reg = [1, 2]
            elif r == 13:
                list_reg = [13, 14, 15]
            else:
                list_reg = [r]

            list_ext_reg = []
            # list_reg = list(set(list(df_base.reg)))
            print('Deriving region extents for tile omission...')
            for reg in list_reg:
                print('Region: ' + str(reg))
                df_tmp = df_base[df_base.reg == reg]
                list_ext_reg.append(np.array([df_tmp.lon.min(), df_tmp.lat.min(), df_tmp.lon.max(), df_tmp.lat.max()],
                                             dtype=float))

            uniq_rgiids = list(set(list(df_keep.rgiid)))

            print('Removing useless tiles')
            list_tile_possib = []
            for tile in list_tile:
                # to speed things up, let's remove all tiles outside region extents
                poly_tile = vt.poly_from_extent(tile)
                chk = 0
                for reg in list_reg:
                    poly = vt.poly_from_extent(list_ext_reg[list_reg.index(reg)])
                    if poly.Intersects(poly_tile):
                        chk = 1
                if chk != 0:
                    list_tile_possib.append(tile)

            print('Finding which tiles have to be processed...')

            if nproc == 1:
                print('Using 1 core...')
                list_rgiids = []
                for i in range(len(list_tile_possib)):
                    # pass the tile as a one-element list so that the wrapper iterates over tiles,
                    # and extend to keep a flat list consistent with the multiprocessing branch
                    tmp_rgiids = wrapper_tile_rgiids(([list_tile_possib[i]], df_base, uniq_rgiids, i,
                                                      len(list_tile_possib)))
                    list_rgiids += tmp_rgiids
            else:
                print('Using ' + str(nproc) + ' cores...')
                pack_size = int(np.ceil(len(list_tile_possib) / nproc))
                argsin = [(list_tile_possib[i:min(i + pack_size, len(list_tile_possib))], df_base, uniq_rgiids, k,
                           nproc) for k, i in enumerate(np.arange(0, len(list_tile_possib), pack_size))]
                pool = mp.Pool(nproc, maxtasksperchild=1)
                outputs = pool.map(wrapper_tile_rgiids, argsin, chunksize=1)
                pool.close()
                pool.join()
                list_rgiids = list(itertools.chain(*outputs))

            list_df_tile = []
            list_tile_final = []
            # list_rgiids_final = []
            for i, rgiids in enumerate(list_rgiids):
                if len(rgiids) == 1 and rgiids[0] == 'NA':
                    continue
                # list_rgiids_final.append(rgiids)
                df_tile = df_keep[df_keep.rgiid.isin(rgiids)]
                list_df_tile.append(df_tile)
                list_tile_final.append(list_tile_possib[i])

            print('Found ' + str(len(list_tile_final)) + ' valid tiles to integrate.')
            print('Integrating...')

            if nproc == 1:
                print('Using 1 core...')
                list_mult_all = []
                for i, tile_final in enumerate(list_tile_final):
                    df_mult_all = wrapper_tile_int_to_all_to_mult_ann((list_df_tile[i], tile_final, i,
                                                                       len(list_tile_final)))
                    list_mult_all.append(df_mult_all)
            else:
                print('Using ' + str(nproc) + ' cores...')
                argsin = [(list_df_tile[i], list_tile_final[i], i, len(list_tile_final))
                          for i in range(len(list_tile_final))]
                pool = mp.Pool(nproc, maxtasksperchild=1)
                outputs = pool.map(wrapper_tile_int_to_all_to_mult_ann, argsin, chunksize=1)
                pool.close()
                pool.join()
                list_mult_all = outputs

            if len(list_mult_all) > 0:
                df_tiles = pd.concat(list_mult_all)
                df_tiles['tile_size'] = tile_size
                df_tiles['category'] = name_keeps[list_keeps.index(keep)]
                df_tiles['reg'] = os.path.basename(fn_int_base).split('_')[1]
                list_df_final.append(df_tiles)

    df_cat = pd.concat(list_df_final)
    df_cat.to_csv(fn_out)
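# Usage sketch (illustration only, not part of the original module): the filenames below are hypothetical; the
# output "dh_world_tiles_2deg.csv" is written next to the first input file.
def _example_world_tiles():

    # per-glacier volume change series (one file per RGI region) with base RGIID info added; the region number is
    # parsed from the second underscore-separated field of the basename
    list_fn_int_base = ['/path/to/dh_06_rgi60_int_base.csv', '/path/to/dh_07_rgi60_int_base.csv']
    # "base" RGI file with rgiid, lat, lon, term and reg columns
    fn_base = '/path/to/base_rgi.csv'

    df_all_base_to_tile(list_fn_int_base, fn_base, tile_size=2, nproc=4, sort_tw=True)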
[docs]def aggregate_df_int_time(infile, tlim=None, rate=False):
    """
    Temporal integration of per-glacier volume change series over any period, either cumulative or as rates

    :param infile: Filename of DataFrame with volume change time series
    :param tlim: Time limits for integration
    :param rate: Boolean, return rates instead of cumulative changes

    :returns: DataFrame of temporally-integrated changes per glacier
    """
    df = pd.read_csv(infile)

    # not the cleanest closest-time search you can write, but works
    times = sorted(list(set(list(df['time']))))
    df_time = pd.DataFrame()
    df_time = df_time.assign(time=times)
    df_time.index = pd.DatetimeIndex(pd.to_datetime(times))
    time_start = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tlim[0]), method='nearest')][0]
    time_end = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tlim[1]), method='nearest')][0]

    df = df.sort_values(by='rgiid')
    df_start = df[df.time == time_start]
    df_end = df[df.time == time_end]

    ind = np.logical_and(df.time >= time_start, df.time < time_end)
    df_period = df[ind]
    df_period_cum = df_period.groupby('rgiid')[['valid_obs']].sum()

    df_gla = pd.DataFrame()
    df_gla['rgiid'] = df_start['rgiid']
    df_gla['valid_obs'] = df_period_cum.valid_obs.values
    df_gla['period'] = str(time_start) + '_' + str(time_end)
    df_gla['area'] = df_start['area']
    df_gla['perc_area_meas'] = df_start['perc_area_meas']
    df_gla['lat'] = df_start['lat']
    df_gla['lon'] = df_start['lon']
    df_gla['dh'] = df_end['dh'].values - df_start['dh'].values
    df_gla['err_dh'] = np.sqrt(df_end['err_dh'].values ** 2 + df_start['err_dh'].values ** 2)
    df_gla['err_cont'] = df_start['perc_err_cont'] * df_start['area'] / 100.
    df_gla['perc_err_cont'] = df_start['perc_err_cont']

    # get the volume change per glacier
    df_gla['dvol'] = df_gla['dh'] * df_gla['area']

    # correct systematic seasonal biases (snow-covered terrain)
    # df_tot['dh'] =

    # propagate error to volume change
    df_gla['err_dvol'] = np.sqrt((df_gla['err_dh'] * df_gla['area']) ** 2
                                 + (df_gla['dh'] * df_gla['perc_err_cont'] / 100. * df_gla['area']) ** 2)

    # convert to mass change (Huss, 2013)
    df_gla['dm'] = df_gla['dvol'] * 0.85 / 10 ** 9

    # propagate error to mass change (Huss, 2013)
    df_gla['err_dm'] = np.sqrt((df_gla['err_dvol'] * 0.85 / 10 ** 9) ** 2
                               + (df_gla['dvol'] * 0.06 / 10 ** 9) ** 2)

    df_gla['dmda'] = df_gla['dm'].values / df_gla['area'].values * 10 ** 9
    df_gla['err_dmda'] = np.sqrt((df_gla['err_dm'].values * 10 ** 9 / df_gla['area'].values) ** 2
                                 + (df_gla['err_cont'].values * df_gla['dm'].values * 10 ** 9
                                    / df_gla['area'].values ** 2) ** 2)

    if rate:
        dt = (tlim[1] - tlim[0]).astype(int) / 365.2524
        df_gla['dmdtda'] = df_gla['dmda'] / dt
        df_gla['err_dmdtda'] = df_gla['err_dmda'] / dt
        df_gla['dhdt'] = df_gla['dh'] / dt
        df_gla['err_dhdt'] = df_gla['err_dh'] / dt
        df_gla = df_gla.drop(columns=['dvol', 'err_dvol', 'dm', 'err_dm', 'dh', 'err_dh', 'dmda', 'err_dmda'])
    else:
        df_gla = df_gla.drop(columns=['dvol', 'err_dvol', 'dm', 'err_dm', 'dh', 'err_dh'])

    # convert area from m2 to km2
    df_gla['area'] = df_gla['area'] / 1000000
    df_gla['reg'] = df_start['reg']

    return df_gla
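# Usage sketch (illustration only, not part of the original module): the filename is hypothetical; tlim is passed
# as a pair of numpy datetime64, consistent with the nearest-date search and day-difference computation above.
def _example_aggregate_period_rates():

    fn_int = '/path/to/dh_06_rgi60_int_base.csv'
    tlim = (np.datetime64('2000-01-01'), np.datetime64('2020-01-01'))

    # per-glacier elevation and mass change rates between the two time steps closest to tlim
    df_gla = aggregate_df_int_time(fn_int, tlim=tlim, rate=True)

    return df_gla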
def aggregate_int_to_shp(df, fn_shp, list_tlim=None, mult_ann=None, field_name='FULL_NAME', code_name='RGI_CODE',
                         sort_tw=False, fn_base=None, fn_tarea=None):

    ds_shp_in = ogr.GetDriverByName('ESRI Shapefile').Open(fn_shp, 0)
    layer_in = ds_shp_in.GetLayer()

    if fn_base is not None:
        df_base = pd.read_csv(fn_base)

    list_df_all = []
    for feature in layer_in:

        geom = feature.GetGeometryRef()
        region = feature.GetField(field_name)
        rgi_code = feature.GetField(code_name)
        centroid = geom.Centroid()
        center_lon, center_lat, _ = centroid.GetPoint()

        # list_rgiid = list(set(list(df.rgiid)))

        # keep only glaciers whose centroid falls in the polygon
        df_rgiid = df.groupby('rgiid')[['lat', 'lon']].mean()
        df_rgiid['rgiid'] = df_rgiid.index
        list_subreg_rgiid = []
        for i in np.arange(len(df_rgiid)):
            print('Working on glacier ' + str(i + 1) + ' out of ' + str(len(df_rgiid)))
            lat = df_rgiid.lat.values[i]
            lon = df_rgiid.lon.values[i]
            point = ogr.Geometry(ogr.wkbPoint)
            point.AddPoint(lon, lat)
            if point.Intersects(geom):
                list_subreg_rgiid.append(df_rgiid.rgiid.values[i])

        if sort_tw:
            # sorting by terminus type requires fn_base to be provided
            ind_tw = np.logical_or(df_base.term == 1, df_base.term == 5)
            keep_tw = list(df_base.rgiid[ind_tw])
            keep_ntw = list(df_base.rgiid[~ind_tw])
            list_keeps = [keep_tw, keep_ntw, list(df_base.rgiid)]
            name_keeps = ['tw', 'ntw', 'all']
        else:
            list_keeps = [list_subreg_rgiid]
            name_keeps = ['all']

        for keeps in list_keeps:

            subset = [rgiid for rgiid in list_subreg_rgiid if rgiid in keeps]
            df_subreg_int = df[df.rgiid.isin(subset)]
            if len(df_subreg_int) == 0:
                continue

            df_subreg_reg = aggregate_int_to_all(df_subreg_int, get_corr_err=True)
            df_subreg_reg['time'] = df_subreg_reg.index.values
            if len(df_subreg_reg) == 0:
                continue

            subreg_area = df_subreg_reg.area.values[0]
            # frac_area = subreg_area / regional_area
            frac_area = None

            list_df_mult = []
            for mult_ann in [1, 2, 4, 5, 10, 20]:
                df_mult = aggregate_all_to_period(df_subreg_reg, list_tlim=list_tlim, mult_ann=mult_ann,
                                                  fn_tarea=fn_tarea, frac_area=frac_area)
                list_df_mult.append(df_mult)
            df_mult_all = pd.concat(list_df_mult)

            df_mult_all['subreg'] = region
            df_mult_all['lon_reg'] = center_lon
            df_mult_all['lat_reg'] = center_lat
            df_mult_all['category'] = name_keeps[list_keeps.index(keeps)]
            df_mult_all['frac_area'] = frac_area

            list_df_all.append(df_mult_all)

    df_in_shp = pd.concat(list_df_all)

    return df_in_shp
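# Usage sketch (illustration only, not part of the original module): the filenames and field names below are
# hypothetical; each polygon of the shapefile is aggregated separately, with glaciers assigned by their centroid.
def _example_aggregate_to_subregions():

    # per-glacier volume change time series with rgiid, lat and lon columns
    df = pd.read_csv('/path/to/dh_06_rgi60_int_base.csv')

    # polygons of sub-regions (e.g., second-order RGI regions) with a name field and a code field
    fn_shp = '/path/to/subregions.shp'

    df_in_shp = aggregate_int_to_shp(df, fn_shp, field_name='FULL_NAME', code_name='RGI_CODE')

    return df_in_shp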