"""
pymmaster.tdem_tools provides tools to post-process DEM stacks: volume integration over specific outlines, comparison to point data, spatial aggregation...
"""
import xarray as xr
import os
import sys
import shutil
import numpy as np
from itertools import groupby
from operator import itemgetter
import multiprocessing as mp
from scipy.interpolate import interp1d
import gdal
import osr
import ogr
import pyproj
import time
from datetime import datetime as dt
import pandas as pd
import pyddem.stack_tools as st
import pyddem.vector_tools as vt
import pyddem.fit_tools as ft
import pyddem.spstats_tools as spt
import pyddem.volint_tools as volt
from pybob.coreg_tools import get_slope, create_stable_mask, dem_coregistration
from pybob.GeoImg import GeoImg
from pybob.ICESat import ICESat
from glob import glob
import itertools
[docs]def inters_feat_shp_stacks(fn_shp, list_fn_stack, feat_field_name):
"""
Find all intersecting stacks for features of a shapefile, for a list of netCDF stacks (tiles for example)
:param fn_shp: Filename of shapefile
:param list_fn_stack: List of filenames of netCDF stacks
:param feat_field_name: Name of shapefile feature to sort by (e.g., RGIIDs for RGI glaciers)
:returns: List of all features found, Corresponding list of intersecting stacks per feature
"""
# get intersecting rgiid for each stack extent
list_list_rgiid = []
for fn_stack in list_fn_stack:
#TODO: temporary test
# ds = xr.open_dataset(fn_stack)
# extent, proj = st.extent_stack(ds)
ds = xr.open_dataset(fn_stack)
tile_name = st.tilename_stack(ds)
lat, lon = vt.SRTMGL1_naming_to_latlon(tile_name)
extent = [lon, lat, lon+1, lat+1]
proj = osr.SpatialReference()
proj.ImportFromEPSG(4326)
list_rgiid = vt.list_shp_field_inters_extent(fn_shp, feat_field_name, extent, proj.ExportToWkt())
list_list_rgiid.append(list_rgiid)
# get all rgiids intersecting all stacks without duplicates
all_rgiid = []
for list_rgiid in list_list_rgiid:
all_rgiid = all_rgiid + list_rgiid
all_rgiid = list(set(all_rgiid))
# inverting to have intersecting stacks by rgiid: more practical to compute that way
list_list_stack_by_rgiid = []
for rgiid in all_rgiid:
list_stacks = []
for fn_stack in list_fn_stack:
if rgiid in list_list_rgiid[list_fn_stack.index(fn_stack)]:
list_stacks.append(fn_stack)
list_list_stack_by_rgiid.append(sorted(list_stacks))
return all_rgiid, list_list_stack_by_rgiid
[docs]def sel_dc(ds, tlim, mask):
"""
Subset datacube in space and time with a spatial mask and time limits
:param ds: xarray Dataset
:param tlim: Time limits
:param mask: Raster boolean mask
:returns: Subset Dataset, Subset boolean mask, Spatial index slices (to identify subset position in the original datacube, for instance)
"""
# select data cube on temporal and spatial mask
if tlim is None:
time1 = time2 = None
else:
time1, time2 = tlim
index = np.where(mask)
minx = ds.x[np.min(index[1])].values
maxx = ds.x[np.max(index[1])].values
miny = ds.y[np.min(index[0])].values
maxy = ds.y[np.max(index[0])].values
dc = ds.sel(dict(time=slice(time1, time2), x=slice(minx, maxx), y=slice(miny, maxy)))
submask = mask[np.min(index[0]):np.max(index[0] + 1), np.min(index[1]):np.max(index[1] + 1)]
return dc, submask, (slice(minx,maxx),slice(miny,maxy))
[docs]def int_dc(dc, mask, **kwargs):
"""
Wrapper to for classic integration of elevation time series into volume change time series
:param dc: xarray Dataset (subset)
:param mask: Raster boolean mask
:returns: DataFrame of integrated elevation change
"""
# integrate data cube over masked area
dh = dc.variables['z'].values - dc.variables['z'].values[0]
err = np.sqrt(dc.variables['z_ci'].values ** 2 + dc.variables['z_ci'].values[0] ** 2)
ref_elev = dc.variables['z'].values[0]
slope = get_slope(st.make_geoimg(dc, 0)).img
slope[np.logical_or(~np.isfinite(slope), slope > 70)] = np.nan
t, y, x = np.shape(dh)
dx = np.round((dc.x.max().values - dc.x.min().values) / float(x))
df = volt.hypso_dc(dh,err,ref_elev,mask,dx,**kwargs)
return df
def get_inters_stack_dem(list_fn_stack,ext,proj):
# get stacks that intersect the extent
poly = vt.poly_from_extent(ext)
list_inters_stack = []
for fn_stack in list_fn_stack:
ds = xr.open_dataset(fn_stack)
ext_st, proj_st = st.extent_stack(ds)
poly_st = vt.poly_from_extent(ext_st)
trans = vt.coord_trans(True, proj_st, True, proj)
poly_st.Transform(trans)
if poly_st.Intersect(poly):
list_inters_stack.append(fn_stack)
return list_inters_stack
def comp_stacks_dem(list_fn_stack,fn_dem,inc_mask=None, exc_mask=None, get_timelapse_filtstack=False, outfile=None):
#get list of intersecting stacks
ext, proj = vt.extent_rast(fn_dem)
list_inters_stack = get_inters_stack_dem(list_fn_stack,ext,proj)
dem_full = GeoImg(fn_dem)
list_dh = list_z_score = list_dt = []
for fn_stack in list_inters_stack:
ds = xr.open_dataset(fn_stack)
ref_img = st.make_geoimg(ds)
mask = create_stable_mask(ref_img,exc_mask,inc_mask)
dem = dem_full.reproject(ref_img)
x,y = np.where(mask)
t = np.array([dem.datetime]*len(x))
dem_sub = dem[x,y]
comp = ds.interp(time=t,x=x,y=y)
comp_h = comp.variables['z'].values
comp_ci = comp.variables['z_ci'].values
dh = dem_sub - comp_h
z_score = dh / comp_ci
list_dh.append(dh)
list_z_score.append(z_score)
if get_timelapse_filtstack:
#here we assume filtered stack is stored at the same location
fn_filt = os.path.join(os.path.dirname(fn_stack),os.path.basename(fn_stack).split('_')[0]+'_filtered.nc')
ds_filt = xr.open_dataset(fn_filt)
ds_near = ds_filt.isel(time=t,x=x,y=y,method='nearest')
near_times = ds_near.time.values
delta_t = near_times - t
list_dt.append(delta_t)
full_dh = np.concatenate(list_dh)
full_z_score = np.concatenate(list_z_score)
full_dt = np.concatenate(list_dt)
df = pd.DataFrame()
df = df.assign(dh=full_dh,z_score=full_z_score,dt=full_dt)
if outfile is None:
return df
else:
df.to_csv(outfile)
def datetime_to_yearfraction(date):
#ref:https://stackoverflow.com/questions/6451655/python-how-to-convert-datetime-dates-to-decimal-years
def sinceEpoch(date): #returns seconds since epoch
return time.mktime(date.timetuple())
s = sinceEpoch
year = date.year
startOfThisYear = dt(year=year, month=1, day=1)
startOfNextYear = dt(year=year+1, month=1, day=1)
yearElapsed = s(date) - s(startOfThisYear)
yearDuration = s(startOfNextYear) - s(startOfThisYear)
fraction = yearElapsed/yearDuration
return date.year + fraction
def create_tmp_dir_for_outfile(file_out):
tmp_dir = os.path.join(os.path.dirname(file_out), 'tmp_'+os.path.splitext(os.path.basename(file_out))[0]) + os.path.sep
if not os.path.exists(tmp_dir):
os.mkdir(tmp_dir)
return tmp_dir
def remove_tmp_dir_for_outfile(file_out):
tmp_dir = os.path.join(os.path.dirname(file_out), 'tmp_'+os.path.splitext(os.path.basename(file_out))[0]) + os.path.sep
if os.path.exists(tmp_dir):
shutil.rmtree(tmp_dir,ignore_errors=True)
[docs]def saga_aspect_slope_curvature(dem_in, topo_out, method_nb=5):
"""
Derive maximum curvature using SAGA (command line)
:param dem_in: dem input
:param topo_out: raster out (3 bands: aspect, slope, curvature)
:param method_nb: algorithm used, see function description
:return:
requirement: SAGA 2.X with X>3
#ref:http://www.saga-gis.org/saga_tool_doc/2.2.3/ta_morphometry_0.html
aspect, slope, curvature methods
methods number
[0] maximum slope (Travis et al. 1975)
[1] maximum triangle slope (Tarboton 1997)
[2] least squares fitted plane (Horn 1981, Costa-Cabral & Burgess 1996)
[3] 6 parameter 2nd order polynom (Evans 1979)
[4] 6 parameter 2nd order polynom (Heerdegen & Beran 1982)
[5] 6 parameter 2nd order polynom (Bauer, Rohdenburg, Bork 1985)
[6] 9 parameter 2nd order polynom (Zevenbergen & Thorne 1987)
[7] 10 parameter 3rd order polynom (Haralick 1983)
unit slope 0=rad, 1=deg, 2=percent
unit aspect 0=rad, 1=deg
"""
tmp_dir = create_tmp_dir_for_outfile(topo_out)
saga_elev = tmp_dir + 'elev_temp.sgrd'
saga_slope = tmp_dir + 'slope_temp.sgrd'
saga_aspect = tmp_dir + 'aspect_temp.sgrd'
saga_max_curv = tmp_dir + 'max_curv_temp.sgrd'
tif_slope = tmp_dir + 'slope_temp.tif'
tif_aspect = tmp_dir + 'aspect_temp.tif'
tif_max_curv = tmp_dir + 'max_curv_temp.tif'
output_vrt = tmp_dir + 'stack.vrt'
os.system('saga_cmd io_gdal 0 -GRIDS ' + saga_elev + ' -FILES ' + dem_in)
os.system(
'saga_cmd ta_morphometry 0 -ELEVATION ' + saga_elev + ' -SLOPE ' + saga_slope + ' -ASPECT ' + saga_aspect + ' -C_MAXI ' + saga_max_curv + ' -METHOD ' + str(
method_nb) + ' -UNIT_SLOPE 1 -UNIT_ASPECT 1')
os.system('saga_cmd io_gdal 2 -GRIDS ' + saga_slope + ' -FILE ' + tif_slope)
os.system('saga_cmd io_gdal 2 -GRIDS ' + saga_aspect + ' -FILE ' + tif_aspect)
os.system('saga_cmd io_gdal 2 -GRIDS ' + saga_max_curv + ' -FILE ' + tif_max_curv)
os.system(
'gdalbuildvrt -separate -overwrite ' + output_vrt + ' ' + tif_slope + ' ' + tif_aspect + ' ' + tif_max_curv)
os.system('gdal_translate ' + output_vrt + ' ' + topo_out)
remove_tmp_dir_for_outfile(topo_out)
[docs]def icesat_comp_wrapper(argsin):
"""
Multiprocessing wrapper to compare ICESat data with elevation time series
:param argsin: Tuple of arranged data (see comp_stacks_icesat)
:returns: Arranged output data (see_comp_stacks_icesat)
"""
fn_stack,ice_coords,ice_latlon,ice_elev,ice_date,groups,dates,read_filt,gla_mask,inc_mask,exc_mask = argsin
full_h, full_dh, full_z_score, full_dt, full_pos, full_slp, full_lat, full_lon, full_dh_ref, full_curv, full_dh_tot, full_time = ([] for i in range(12))
# full_time = np.array([],dtype='datetime64[D]')
ds = xr.open_dataset(fn_stack)
ds.load()
tile_name = st.tilename_stack(ds)
tmp_h = st.make_geoimg(ds)
tmp_ci = st.make_geoimg(ds)
tmp_dhtot = st.make_geoimg(ds)
tmp_slope = st.make_geoimg(ds)
tmp_slope.img = ds.slope.values
tmp_dhtot.img = ds.z[-1, :].values - ds.z[0, :].values
# tmp_slope.img = np.ones(np.shape(tmp_slope.img))*20
ds_sub = ds.interp(time=dates)
terrain_mask = None
get_curv = True
#glacier mask has priority and won't be changed, gla = 2
if gla_mask is not None:
print('Tile '+tile_name+': deriving terrain glacier mask...')
#rasterizing mask and changing to float to use geoimg.raster_points2
mask = ft.get_stack_mask(gla_mask,ds)
tmp_exc_mask = st.make_geoimg(ds)
tmp_exc_mask.img = np.zeros(np.shape(mask),dtype='float32')
tmp_exc_mask.img[mask] = 2
terrain_mask = tmp_exc_mask
#here we take the difference of inclusion and exclusion mask for the rest of terrain, inc = 1, exc = 0
if inc_mask is not None:
print('Tile ' + tile_name + ': deriving terrain inclusion mask...')
# rasterizing mask and changing to float to use geoimg.raster_points2
mask = ft.get_stack_mask(inc_mask, ds)
if terrain_mask is not None:
land_mask = np.logical_and(terrain_mask.img == 0, mask)
terrain_mask.img[land_mask] = 1
else:
tmp_inc_mask = st.make_geoimg(ds)
tmp_inc_mask.img = np.zeros(np.shape(mask),dtype='float32')
tmp_inc_mask[mask] = 1
terrain_mask = tmp_inc_mask
if exc_mask is not None:
print('Tile ' + tile_name + ': deriving terrain exclusion mask...')
# rasterizing mask and changing to float to use geoimg.raster_points2
mask = ft.get_stack_mask(exc_mask, ds)
if terrain_mask is not None:
noland_mask = np.logical_and(terrain_mask.img <= 1, mask)
terrain_mask.img[noland_mask] = 0
else:
tmp_exc_mask = st.make_geoimg(ds)
tmp_exc_mask.img = np.ones(np.shape(mask), dtype='float32')
tmp_exc_mask[mask] = 0
terrain_mask = tmp_exc_mask
fn_raw = os.path.join(os.path.dirname(fn_stack), os.path.basename(fn_stack).split('_')[0] + '.nc')
ds_raw = xr.open_dataset(fn_raw)
tmp_ref_h = st.make_geoimg(ds_raw,band=(slice(0,None),slice(0,None)),var='ref_z')
tmp_ref_h = tmp_ref_h.reproject(tmp_h)
#get curvature
if get_curv:
fn_tmp_ref = os.path.join(os.path.dirname(fn_stack),'tmp_ref_'+os.path.basename(fn_stack).split('_')[0]+'.tif')
tmp_ref_h.write(fn_tmp_ref)
fn_tmp_curv = os.path.join(os.path.dirname(fn_stack),'tmp_curv_'+os.path.basename(fn_stack).split('_')[0]+'.tif')
saga_aspect_slope_curvature(fn_tmp_ref,fn_tmp_curv)
ds_curv = gdal.Open(fn_tmp_curv,gdal.GA_ReadOnly)
curv_arr = ds_curv.GetRasterBand(3).ReadAsArray()
tmp_curv = GeoImg(fn_tmp_curv)
tmp_curv.img = curv_arr
ds_curv = None
os.remove(fn_tmp_curv)
os.remove(fn_tmp_ref)
else:
tmp_curv = st.make_geoimg(ds)
tmp_curv.img = np.zeros(np.shape(tmp_curv.img),dtype='float32')
if read_filt:
print('Tile '+tile_name+': getting original data dates from filtered array...')
#we read the boolean data cube indicating positions where original data was used
tmp_dt = st.make_geoimg(ds)
fn_filt = os.path.join(os.path.dirname(fn_stack), os.path.basename(fn_stack).split('_')[0] + '_filtered.nc')
ds_filt = xr.open_dataset(fn_filt)
ds_filt.load()
#first, remove duplicate dates by merging boolean arrays for same dates
t_vals = list(ds_filt.time.values)
dates_rm_dupli = sorted(list(set(t_vals)))
ind_firstdate = []
for i, date in enumerate(dates_rm_dupli):
ind_firstdate.append(t_vals.index(date))
ds_filt2 = ds_filt.isel(time=np.array(ind_firstdate))
for i in range(len(dates_rm_dupli)):
t_ind = (t_vals == dates_rm_dupli[i])
if len(t_ind) > 1:
ds_filt2.z.values[i, :] = np.any(ds_filt.z[t_vals == dates_rm_dupli[i], :].values.astype(bool), axis=0)
# getting time data as days since 2000
y0 = np.datetime64('2000-01-01')
ftime = ds_filt2.time.values
ftime_delta = np.array([t - y0 for t in ftime])
days = [td.astype('timedelta64[D]').astype(int) for td in ftime_delta]
# reindex to get closest not-NaN time value of the date vector
filt_arr = np.array(ds_filt2.z.values, dtype='float32')
filt_arr[filt_arr == 0] = np.nan
days = np.array(days)
filt_arr = filt_arr * days[:, None, None]
at_least_2 = np.count_nonzero(~np.isnan(filt_arr), axis=0) >= 2
filt_tmp = filt_arr[:, at_least_2]
out_arr = np.copy(filt_tmp)
for i in range(np.shape(filt_tmp)[1]):
ind = ~np.isnan(filt_tmp[:, i])
fn = interp1d(days[ind], filt_tmp[:, i][ind], kind='nearest', fill_value='extrapolate', assume_sorted=True)
out_arr[:, i] = fn(days)
filt_arr[:, at_least_2] = out_arr
ds_filt2.z.values = filt_arr
ds_filt_sub = ds_filt2.reindex(time=dates, method='nearest')
for i, group in enumerate(groups):
#keep only a campaign group
pts_idx_dt = np.array(ice_date == group)
date = dates[i]
print('Tile '+tile_name+': calculating differences for the ' + str(
np.count_nonzero(pts_idx_dt)) + ' points of campaign:' + str(date))
subsamp_ice = [tup for i, tup in enumerate(ice_coords) if pts_idx_dt[i]]
subsamp_latlon = [tup for i, tup in enumerate(ice_latlon) if pts_idx_dt[i]]
tmp_h.img = ds_sub.z[i, :].values
comp_pts_h = tmp_h.raster_points(subsamp_ice, nsize=3, mode='linear')
tmp_ci.img = ds_sub.z_ci[i, :].values
comp_pts_ci = tmp_ci.raster_points(subsamp_ice, nsize=3, mode='linear')
comp_pts_dhtot = tmp_dhtot.raster_points(subsamp_ice, nsize=3, mode='linear')
comp_pts_slope = tmp_slope.raster_points(subsamp_ice, nsize=3, mode='linear')
comp_pts_ref_h = tmp_ref_h.raster_points(subsamp_ice, nsize=3, mode='linear')
comp_pts_curv = tmp_curv.raster_points(subsamp_ice,nsize=3,mode='linear')
dh = ice_elev[pts_idx_dt] - comp_pts_h
good_vals = np.isfinite(dh)
dh = dh[good_vals]
h = ice_elev[pts_idx_dt][good_vals]
z_score = dh / comp_pts_ci[good_vals]
slp = comp_pts_slope[good_vals]
dh_ref = ice_elev[pts_idx_dt] - comp_pts_ref_h
dh_ref = dh_ref[good_vals]
curv = comp_pts_curv[good_vals]
dh_tot = comp_pts_dhtot[good_vals]
if read_filt:
day_diff = (date - y0).astype('timedelta64[D]').astype(int)
tmp_dt.img = np.abs(ds_filt_sub.z[i, :].values - np.ones(np.shape(tmp_dt.img)) * day_diff)
comp_pts_dt = tmp_dt.raster_points(subsamp_ice, nsize=3, mode='mean')
dt_out = comp_pts_dt[good_vals]
else:
dt_out = np.zeros(len(dh)) * np.nan
if terrain_mask is not None:
comp_pts_mask = terrain_mask.raster_points(subsamp_ice, nsize=5, mode='nearest')
pos = comp_pts_mask[good_vals]
else:
pos = np.zeros(len(dh))
good_subsamp_latlon = [tup for i, tup in enumerate(subsamp_latlon) if good_vals[i]]
if len(good_subsamp_latlon) > 0:
lon = np.array([tup[1] for i, tup in enumerate(good_subsamp_latlon)])
lat = np.array([tup[0] for i, tup in enumerate(good_subsamp_latlon)])
else:
lat=np.array([])
lon=np.array([])
full_h.append(h)
full_dh.append(dh)
full_z_score.append(z_score)
full_dt.append(dt_out)
full_pos.append(pos)
full_slp.append(slp)
full_time.append(np.array([date]*len(dh),dtype='datetime64[D]'))
full_curv.append(curv)
full_lat.append(lat)
full_lon.append(lon)
full_dh_ref.append(dh_ref)
full_dh_tot.append(dh_tot)
full_h = np.concatenate(full_h)
full_dh = np.concatenate(full_dh)
full_z_score = np.concatenate(full_z_score)
full_dt = np.concatenate(full_dt)
full_pos = np.concatenate(full_pos)
full_slp = np.concatenate(full_slp)
full_time = np.concatenate(full_time)
full_lat = np.concatenate(full_lat)
full_lon = np.concatenate(full_lon)
full_dh_ref = np.concatenate(full_dh_ref)
full_curv = np.concatenate(full_curv)
full_dh_tot = np.concatenate(full_dh_tot)
return full_h, full_dh, full_z_score, full_dt, full_pos, full_slp, full_time, full_lon, full_lat, full_dh_ref, full_curv, full_dh_tot
[docs]def comp_stacks_icesat(list_fn_stack,fn_icesat,gla_mask=None,inc_mask=None,exc_mask=None,nproc=1,read_filt=False,shift=None):
"""
Compare ICESat data with elevation time series
:param list_fn_stack: List of filename of netCDF to compare to
:param fn_icesat: Filename of ICESat HDF5 file
:param gla_mask: Filename of glacier shapefile
:param inc_mask: Filename of inclusion shapefile (e.g. land)
:param exc_mask: Filename of exclusion shapeifle (e.g. ocean)
:param nproc: Number of cores used for multiprocessing [1]
:param read_filt: Boolean, read boolean data cube of valid observation
:returns: Arranged output data (see_comp_stacks_icesat)
"""
ice = ICESat(fn_icesat)
el_limit = -200
mykeep = ice.elev > el_limit
#exception for North Asia subreg 3 ICESat file
if os.path.basename(fn_icesat) == 'ICESat_10_3_rgi50_NorthAsia.h5':
keep_west = np.logical_and(ice.lon<-172,ice.lon>-179.999)
keep_east_1 = np.logical_and(ice.lon<96,ice.lon>88)
keep_east_2 = np.logical_and(ice.lon<146.999,ice.lon>138)
keep_northasia_3 = np.logical_or.reduce((keep_west,keep_east_1,keep_east_2))
mykeep = np.logical_and(mykeep,keep_northasia_3)
ice.x = ice.x[mykeep]
ice.y = ice.y[mykeep]
ice.lat = ice.lat[mykeep]
ice.lon = ice.lon[mykeep]
ice.elev = ice.elev[mykeep]
ice.UTCTime = ice.UTCTime[mykeep]
ice.xy = list(zip(ice.x, ice.y))
#putting code above in ICESat.clean() breaks co-registration...
# ice.clean(el_limit=-200)
#get intersecting tiles
bounds = ice.get_bounds()
proj = osr.SpatialReference()
proj.ImportFromEPSG(4326)
proj_wkt = proj.ExportToWkt()
list_inters = get_inters_stack_dem(list_fn_stack,[bounds[0],bounds[2],bounds[1],bounds[3]],proj_wkt)
if len(list_inters) == 0:
print('Found no intersection to ICESat file: '+fn_icesat)
raise ValueError('No intersecting file!')
#laser campaigns of ICESat
laser_op_icesat = [(dt(2003, 2, 20), dt(2003, 3, 29)), (dt(2003, 9, 25), dt(2003, 11, 19)),
(dt(2004, 2, 17), dt(2004, 3, 21)),
(dt(2004, 5, 18), dt(2004, 6, 21)), (dt(2004, 10, 3), dt(2004, 11, 8)),
(dt(2005, 2, 17), dt(2005, 3, 24)),
(dt(2005, 5, 20), dt(2005, 6, 23)), (dt(2005, 10, 21), dt(2005, 11, 24)),
(dt(2006, 2, 22), dt(2006, 3, 28)),
(dt(2006, 5, 24), dt(2006, 6, 26)), (dt(2006, 10, 25), dt(2006, 11, 27)),
(dt(2007, 3, 12), dt(2007, 4, 14)),
(dt(2007, 10, 2), dt(2007, 11, 5)), (dt(2008, 2, 17), dt(2008, 3, 21)),
(dt(2008, 10, 4), dt(2008, 10, 19)),
(dt(2008, 11, 25), dt(2008, 12, 17)), (dt(2009, 3, 9), dt(2009, 4, 11)),
(dt(2009, 9, 30), dt(2009, 10, 11))]
#campaign names
laser_op_name = ['1AB', '2A', '2B', '2C', '3A', '3B', '3C', '3D', '3E', '3F', '3G', '3H', '3I', '3J', '3K', '2D',
'2E', '2F']
# group ICESat dates by operation
utc_days = ice.UTCTime
for i in range(len(laser_op_icesat)):
# datetime to UTC days
start_dt = datetime_to_yearfraction(laser_op_icesat[i][0]) * 365.2422 - 10
end_dt = datetime_to_yearfraction(laser_op_icesat[i][1]) * 365.2422 + 10
mean_dt = 0.5 * (start_dt + end_dt)
idx = np.logical_and(utc_days < end_dt, utc_days > start_dt)
utc_days[idx] = mean_dt
groups = sorted(list(set(list(utc_days))))
dates = np.array(
[np.datetime64('01-01-01') + np.timedelta64(int(np.floor(group - 365.2422)), 'D') for group in groups])
#prepare icesat arrays to pass to wrapper per stack, to avoid reading HDF5/NetCDF multiple times if doing parallel
icesat_argsin = []
for fn_stack in list_inters:
ds = xr.open_dataset(fn_stack)
tile_name = st.tilename_stack(ds)
lat, lon = vt.SRTMGL1_naming_to_latlon(tile_name)
pts_idx = np.logical_and.reduce((ice.lat > lat, ice.lat <= lat + 1, ice.lon > lon, ice.lon <= lon + 1))
check = np.count_nonzero(pts_idx)
print('Tile ' + tile_name + ': found ' + str(check) + ' ICESat points')
if check > 0:
_, utm = vt.latlon_to_UTM(lat, lon)
ice.project('epsg:{}'.format(vt.epsg_from_utm(utm)))
ice_coords = [tup for i, tup in enumerate(ice.xy) if pts_idx[i]]
ice_elev = ice.elev[pts_idx]
ice_date = ice.UTCTime[pts_idx]
ice_latlon = np.array(list(zip(ice.lat[pts_idx],ice.lon[pts_idx])))
icesat_argsin.append((fn_stack,np.copy(ice_coords),np.copy(ice_latlon),np.copy(ice_elev),np.copy(ice_date),np.copy(groups),np.copy(dates),read_filt,gla_mask,inc_mask,exc_mask))
if nproc == 1:
list_h, list_dh, list_zsc, list_dt, list_pos, list_slp, list_time, list_lat, list_lon, list_dh_ref, list_curv, list_dh_tot = ([] for i in range(11))
for i in range(len(icesat_argsin)):
tmp_h, tmp_dh, tmp_zsc, tmp_dt, tmp_pos, tmp_slp, tmp_time, tmp_lon, tmp_lat, tmp_dh_ref, tmp_curv, tmp_dh_tot = icesat_comp_wrapper(icesat_argsin[i])
list_h.append(tmp_h)
list_dh.append(tmp_dh)
list_zsc.append(tmp_zsc)
list_dt.append(tmp_dt)
list_pos.append(tmp_pos)
list_slp.append(tmp_slp)
list_time.append(tmp_time)
list_lat.append(tmp_lat)
list_lon.append(tmp_lon)
list_dh_ref.append(tmp_dh_ref)
list_curv.append(tmp_curv)
list_dh_tot.append(tmp_dh_tot)
h = np.concatenate(list_h)
dh = np.concatenate(list_dh)
zsc = np.concatenate(list_zsc)
dt_out = np.concatenate(list_dt)
pos = np.concatenate(list_pos)
slp = np.concatenate(list_slp)
t = np.concatenate(list_time)
lat = np.concatenate(list_lat)
lon = np.concatenate(list_lon)
dh_ref = np.concatenate(list_dh_ref)
curv = np.concatenate(list_curv)
dh_tot = np.concatenate(list_dh_tot)
else:
nproc=min(len(icesat_argsin),nproc)
print('Using '+str(nproc)+' processors...')
pool = mp.Pool(nproc,maxtasksperchild=1)
outputs = pool.map(icesat_comp_wrapper,icesat_argsin)
pool.close()
pool.join()
zip_out = list(zip(*outputs))
h = np.concatenate(zip_out[0])
dh = np.concatenate(zip_out[1])
zsc = np.concatenate(zip_out[2])
dt_out = np.concatenate(zip_out[3])
pos = np.concatenate(zip_out[4])
slp = np.concatenate(zip_out[5])
t = np.concatenate(zip_out[6])
lon = np.concatenate(zip_out[7])
lat = np.concatenate(zip_out[8])
dh_ref = np.concatenate(zip_out[9])
curv = np.concatenate(zip_out[10])
dh_tot = np.concatenate(zip_out[11])
return h, dh, zsc, dt_out, pos, slp, t, lon, lat, dh_ref, curv, dh_tot
[docs]def comp_stacks_icebridge(list_fn_stack,fn_icebridge,gla_mask=None,inc_mask=None,exc_mask=None,nproc=1,read_filt=False):
"""
Compare IceBridge data with elevation time series
:param list_fn_stack: List of filename of netCDF to compare to
:param fn_icebridge: Filename of IceBridge post-processed into .csv
:param gla_mask: Filename of glacier shapefile
:param inc_mask: Filename of inclusion shapefile (e.g. land)
:param exc_mask: Filename of exclusion shapeifle (e.g. ocean)
:param nproc: Number of cores used for multiprocessing [1]
:param read_filt: Boolean, read boolean data cube of valid observation
:returns: Arranged output data (see_comp_stacks_icesat)
"""
iceb = pd.read_csv(fn_icebridge)
#get intersecting tiles
bounds = (iceb.lon.min(), iceb.lon.max(), iceb.lat.min(), iceb.lat.max())
proj = osr.SpatialReference()
proj.ImportFromEPSG(4326)
proj_wkt = proj.ExportToWkt()
list_inters = get_inters_stack_dem(list_fn_stack,[bounds[0],bounds[2],bounds[1],bounds[3]],proj_wkt)
if len(list_inters) == 0:
print('Found no intersection to IceBridge file: '+fn_icebridge)
raise ValueError('No intersecting file!')
groups = sorted(list(set(list(pd.to_datetime(iceb.t.values).values.astype('datetime64[D]')))))
dates = groups
#prepare icesat arrays to pass to wrapper per stack, to avoid reading HDF5/NetCDF multiple times if doing parallel
icebridge_argsin = []
for fn_stack in list_inters:
ds = xr.open_dataset(fn_stack)
tile_name = st.tilename_stack(ds)
lat, lon = vt.SRTMGL1_naming_to_latlon(tile_name)
pts_idx = np.logical_and.reduce((iceb.lat > lat, iceb.lat <= lat + 1, iceb.lon > lon, iceb.lon <= lon + 1))
check = np.count_nonzero(pts_idx)
print('Tile ' + tile_name + ': found ' + str(check) + ' IceBridge points')
if check > 0:
_, utm = vt.latlon_to_UTM(lat, lon)
dest_proj ='epsg:{}'.format(vt.epsg_from_utm(utm))
dest_proj = pyproj.Proj(init=dest_proj)
wgs84 = pyproj.Proj(init='epsg:4326')
x, y = pyproj.transform(wgs84, dest_proj, iceb.lon.values, iceb.lat.values)
xy = list(zip(x,y))
ice_coords = [tup for i, tup in enumerate(xy) if pts_idx[i]]
ice_elev = iceb.h[pts_idx].values
ice_date = pd.to_datetime(iceb.t[pts_idx].values).values.astype('datetime64[D]')
ice_latlon = np.array(list(zip(iceb.lat[pts_idx].values,iceb.lon[pts_idx].values)))
icebridge_argsin.append((fn_stack,np.copy(ice_coords),np.copy(ice_latlon),np.copy(ice_elev),np.copy(ice_date),np.copy(groups),np.copy(dates),read_filt,gla_mask,inc_mask,exc_mask))
if nproc == 1:
list_h, list_dh, list_zsc, list_dt, list_pos, list_slp, list_time, list_lat, list_lon, list_dh_ref, list_curv, list_dh_tot = ([] for i in range(11))
for i in range(len(icebridge_argsin)):
tmp_h, tmp_dh, tmp_zsc, tmp_dt, tmp_pos, tmp_slp, tmp_time, tmp_lon, tmp_lat, tmp_dh_ref, tmp_curv, tmp_dh_tot = icesat_comp_wrapper(icebridge_argsin[i])
list_h.append(tmp_h)
list_dh.append(tmp_dh)
list_zsc.append(tmp_zsc)
list_dt.append(tmp_dt)
list_pos.append(tmp_pos)
list_slp.append(tmp_slp)
list_time.append(tmp_time)
list_lat.append(tmp_lat)
list_lon.append(tmp_lon)
list_dh_ref.append(tmp_dh_ref)
list_curv.append(tmp_curv)
list_dh_tot.append(tmp_dh_tot)
h = np.concatenate(list_h)
dh = np.concatenate(list_dh)
zsc = np.concatenate(list_zsc)
dt_out = np.concatenate(list_dt)
pos = np.concatenate(list_pos)
slp = np.concatenate(list_slp)
t = np.concatenate(list_time)
lat = np.concatenate(list_lat)
lon = np.concatenate(list_lon)
dh_ref = np.concatenate(list_dh_ref)
curv = np.concatenate(list_curv)
dh_tot = np.concatenate(list_dh_tot)
else:
nproc=min(len(icebridge_argsin),nproc)
print('Using '+str(nproc)+' processors...')
pool = mp.Pool(nproc,maxtasksperchild=1)
outputs = pool.map(icesat_comp_wrapper,icebridge_argsin)
pool.close()
pool.join()
zip_out = list(zip(*outputs))
h = np.concatenate(zip_out[0])
dh = np.concatenate(zip_out[1])
zsc = np.concatenate(zip_out[2])
dt_out = np.concatenate(zip_out[3])
pos = np.concatenate(zip_out[4])
slp = np.concatenate(zip_out[5])
t = np.concatenate(zip_out[6])
lon = np.concatenate(zip_out[7])
lat = np.concatenate(zip_out[8])
dh_ref = np.concatenate(zip_out[9])
curv = np.concatenate(zip_out[10])
dh_tot = np.concatenate(zip_out[11])
return h, dh, zsc, dt_out, pos, slp, t, lon, lat, dh_ref, curv, dh_tot
def shift_icesat_stack(fn_ref,fn_icesat,fn_shp):
_ , _ , shift_params, stats = dem_coregistration(fn_icesat,fn_ref,glaciermask=fn_shp,pts=True,inmem=True)
return shift_params, stats
def combine_postproc_stacks_tvol(list_fn_stack, fn_shp, feat_id='RGIId', tlim=None, write_combined=True, outdir='.'):
# get all rgiid intersecting stacks and the list of intersecting stacks
all_rgiids, list_list_stacks = inters_feat_shp_stacks(fn_shp, list_fn_stack, feat_id)
# sort by rgiid group with same intersecting stacks
list_tuples = list(zip(all_rgiids, list_list_stacks))
grouped = [(k, list(list(zip(*g))[0])) for k, g in groupby(list_tuples, itemgetter(1))]
# loop through similar combination of stacks (that way, only have to combine/open them once)
for i in range(len(grouped)):
list_fn_stack_pack = grouped[i][0]
rgiid_pack = grouped[i][1]
list_ds = st.open_datasets(list_fn_stack_pack)
if len(list_ds) > 1:
ds = st.combine_stacks(list_ds)
else:
ds = list_ds[0]
if write_combined:
list_tile = [os.path.splitext(os.path.basename(fn))[0].split('_')[0] for fn in list_fn_stack_pack]
out_nc = os.path.join(outdir, 'combined_stacks', '_'.join(list_tile))
ds.to_netcdf(out_nc)
df_tot = pd.DataFrame()
# loop through rggiids
for rgiid in rgiid_pack:
ds_shp = gdal.OpenEx(fn_shp, gdal.OF_VECTOR)
layer_name = os.path.splitext(os.path.basename(fn_shp))[0]
geoimg = st.make_geoimg(ds, 0)
mask = vt.geoimg_mask_on_feat_shp_ds(ds_shp, geoimg, layer_name=layer_name, feat_id=feat_id, feat_val=rgiid)
dc, submask, _ = sel_dc(ds, tlim, mask)
df = int_dc(dc, submask)
df_tot.append(df)
[docs]def get_dt_closest_valid(ds_filt,dates):
"""
Derive data cube of time lag to closest valid observation based on the date vector of the time series and a boolean data cube of valid observation at original dates
:param ds_filt: Boolean data cube of valid observations
:param dates: Time vector of time series
:returns: Data cube indexed to time vector with time lags to observation, Data cube with count of valid observation between time index, Data cube with yearly count of valid observation at the first yearly time index
"""
# we read the boolean data cube indicating positions where original data was used
# first, remove duplicate dates by merging boolean arrays for same dates
t_vals = list(ds_filt.time.values)
dates_rm_dupli = sorted(list(set(t_vals)))
ind_firstdate = []
for i, date in enumerate(dates_rm_dupli):
ind_firstdate.append(t_vals.index(date))
ds_filt2 = ds_filt.isel(time=np.array(ind_firstdate))
for i in range(len(dates_rm_dupli)):
t_ind = (t_vals == dates_rm_dupli[i])
if len(t_ind) > 1:
ds_filt2.z.values[i, :] = np.any(ds_filt.z[t_vals == dates_rm_dupli[i], :].values.astype(bool), axis=0)
valid_obs = np.zeros((len(dates),ds_filt2.z.shape[1],ds_filt2.z.shape[2]))
for i in range(len(dates)-1):
ind = np.logical_and(dates_rm_dupli>=dates[i],dates_rm_dupli<dates[i+1])
valid_obs[i,:,:] = np.sum(ds_filt2.z.values[ind,:],axis=0)
#not robust, works only for monthly (pressé!)
valid_obs_peryear = np.zeros((len(dates), ds_filt2.z.shape[1], ds_filt2.z.shape[2]))
for i in range(len(dates) - 1):
if i % 12 == 0:
ind = np.logical_and(dates_rm_dupli >= dates[i], dates_rm_dupli < dates[i + 12])
valid_obs_peryear[i, :, :] = np.any(ds_filt2.z.values[ind, :], axis=0)
# getting time data as days since 2000
y0 = np.datetime64('2000-01-01')
ftime = ds_filt2.time.values
ftime_delta = np.array([t - y0 for t in ftime])
days = [td.astype('timedelta64[D]').astype(int) for td in ftime_delta]
# reindex to get closest not-NaN time value of the date vector
filt_arr = np.array(ds_filt2.z.values, dtype='float32')
filt_arr[filt_arr == 0] = np.nan
days = np.array(days)
filt_arr = filt_arr * days[:, None, None]
at_least_2=np.count_nonzero(~np.isnan(filt_arr),axis=0)>=2
filt_tmp = filt_arr[:,at_least_2]
out_arr = np.copy(filt_tmp)
for i in range(np.shape(filt_tmp)[1]):
ind = ~np.isnan(filt_tmp[:,i])
fn = interp1d(days[ind],filt_tmp[:,i][ind],kind='nearest',fill_value='extrapolate',assume_sorted=True)
out_arr[:,i] = fn(days)
filt_arr[:,at_least_2] = out_arr
ds_filt2.z.values = filt_arr
ds_filt_sub = ds_filt2.reindex(time=dates, method='nearest')
for i in range(len(dates)):
date = dates[i]
day_diff = (date - y0).astype('timedelta64[D]').astype(int)
ds_filt_sub.z.values[i,:] = np.abs(ds_filt_sub.z.values[i, :] - np.ones(ds_filt_sub.z.shape[1:3]) * day_diff)
return ds_filt_sub, valid_obs, valid_obs_peryear
[docs]def sel_int_hypsocheat(argsin):
"""
Multiprocessing wrapper by shapefile feature for "hypsometric cheat" volume integration of elevation time series
:param argsin: Tuple with: list of filename of netCDF stacks, shapefile parameters (see hypsocheat_postproc_stacks_tvol), time limits, processing index
:returns: Three dataframes of volume change (see hypso_dc in volint_tools.py)
"""
# integrate volume without having to know the exact spatial disposition: works for hypsometric (needs only reference elevation)
# a LOT faster as we don't need to combine (reproject + merge) stacks
list_fn_stack_pack, shp_params, tlim, i = argsin
print('Integrating volume for outline group: '+str(i+1))
list_ds = st.open_datasets(list_fn_stack_pack,load=True)
dates = list_ds[0].time.values
list_ds_filt = []
for fn_stack in list_fn_stack_pack:
fn_filt = os.path.join(os.path.dirname(fn_stack), os.path.basename(fn_stack).split('_')[0] + '_filtered.nc')
ds_filt = xr.open_dataset(fn_filt)
ds_filt.load()
list_ds_filt.append(ds_filt)
fn_shp, feat_id, list_feat_val = shp_params
ds_shp = gdal.OpenEx(fn_shp, gdal.OF_VECTOR)
layer_name = os.path.splitext(os.path.basename(fn_shp))[0]
layer = ds_shp.GetLayer()
list_lat, list_lon, list_err_area, list_area = ([] for i in range(4))
for feat_val in list_feat_val:
for feature in layer:
if feat_val == feature.GetField(feat_id):
geom = feature.GetGeometryRef()
centroid_lon, centroid_lat, _ = geom.Centroid().GetPoint()
list_lon.append(centroid_lon)
list_lat.append(centroid_lat)
#get the area ratio for a 50 meter buffer
proj_in = osr.SpatialReference()
proj_in.ImportFromEPSG(4326)
perc_area_buff, area = vt.get_buffered_area_ratio(geom,proj_in.ExportToWkt(),15.)
list_area.append(area)
list_err_area.append(perc_area_buff)
layer.ResetReading()
df_tot, df_hyp_tot, df_int_tot = (pd.DataFrame() for i in range(3))
for feat_val in list_feat_val:
dh, err, ref, dt, count_area, valid, valid_py = ([] for i in range(7))
print('Working on feature ID: '+feat_val)
for i, ds in enumerate(list_ds):
ds_filt = list_ds_filt[i]
#get raster equivalent of stack
geoimg = st.make_geoimg(ds, 0)
#get mask of latlon tiling
tile_name = st.tilename_stack(ds)
mask_tile = vt.latlontile_nodatamask(geoimg,tile_name)
mask_feat = vt.geoimg_mask_on_feat_shp_ds(ds_shp, geoimg, layer_name=layer_name, feat_id=feat_id, feat_val=feat_val)
mask = np.logical_and(mask_tile,mask_feat)
if np.count_nonzero(mask) >0:
dc, submask, _ = sel_dc(ds,tlim,mask)
dc_dt, _, _ = sel_dc(ds_filt,tlim,mask)
dc_dt_sub, valid_obs, valid_obs_peryear = get_dt_closest_valid(dc_dt,dates)
tmp_dh = dc.z.values[:,submask] - dc.z.values[0,submask]
tmp_err = np.sqrt(dc.z_ci.values[:,submask]**2 + dc.z_ci.values[0,submask]**2)
tmp_ref = dc.z.values[0,submask]
tmp_dt = dc_dt_sub.z.values[:,submask]
tmp_valid = valid_obs[:, submask]
tmp_valid_peryear = valid_obs_peryear[:,submask]
count = np.count_nonzero(mask)
#exception! to correct in fit_tools at some point?
#when there is a same-date DEM overlapping twice, and those are the only 2 observations over the whole period, the dh is wrongly of "0" and the dt is only NaNs
#temporary fix:
at_least_2 = np.count_nonzero(~np.isnan(tmp_dt), axis=1) >=2
tmp_dh[~at_least_2, :] = np.nan
tmp_err[~at_least_2, :] = np.nan
dh.append(tmp_dh)
err.append(tmp_err)
ref.append(tmp_ref)
dt.append(tmp_dt)
count_area.append(count)
valid.append(tmp_valid)
valid_py.append(tmp_valid_peryear)
else:
continue
ds = list_ds[0]
x = ds.x.shape[0]
dx = np.round((ds.x.max().values - ds.x.min().values) / float(x))
if len(dh)>0 and np.count_nonzero(~np.isnan(np.concatenate(dh,axis=1)))>0:
dh = np.concatenate(dh,axis=1)
err = np.concatenate(err,axis=1)
ref = np.concatenate(ref)
dt = np.concatenate(dt,axis=1)
valid = np.concatenate(valid,axis=1)
valid_py = np.concatenate(valid_py,axis=1)
df, df_hyp, df_int = volt.hypso_dc(dh,err,ref,dt,dates,np.ones(np.shape(ref),dtype=bool),dx)
df_int['valid_obs'] = np.nanmean(valid,axis=1)
df_int['valid_obs_py'] = np.nanmean(valid_py,axis=1)
else:
area = np.sum(np.array(count_area))*dx**2
df = pd.DataFrame()
df = df.assign(hypso=[np.nan], time=[np.nan], dh=[np.nan], err_dh=[np.nan])
df_hyp = pd.DataFrame()
df_hyp = df_hyp.assign(hypso=[np.nan], area_meas=[np.nan], area_tot=[area], nmad=[np.nan])
df_int = pd.DataFrame()
df_int = df_int.assign(time=dates, dh=[np.nan]*len(dates), err_dh=[np.nan]*len(dates), perc_area_meas=[np.nan]*len(dates), valid_obs=[np.nan]*len(dates),valid_obs_py=[np.nan]*len(dates))
df['rgiid'] = feat_val
df_hyp['rgiid'] = feat_val
df_int['rgiid'] = feat_val
df_int['area'] = list_area[list_feat_val.index(feat_val)]
df_int['lon'] = list_lon[list_feat_val.index(feat_val)]
df_int['lat'] = list_lat[list_feat_val.index(feat_val)]
df_int['perc_err_cont'] = list_err_area[list_feat_val.index(feat_val)]
df_tot = df_tot.append(df)
df_hyp_tot = df_hyp_tot.append(df_hyp)
df_int_tot = df_int_tot.append(df_int)
return df_tot, df_hyp_tot, df_int_tot
[docs]def hypsocheat_postproc_stacks_tvol(list_fn_stack, fn_shp, feat_id='RGIId', tlim=None,nproc=1, outfile='int_dh.csv'):
"""
Hypsometric cheat volume integration of elevation time series
The objective of the "cheat" is to highly decrease processing time by allowing to combine parts of shapefile features (e.g., glaciers)
which are projected in different georeferencing systems in the stacks (projected in UTM), without having to merge the stacks into the same projection
Because integration is hypsometric (based on reference elevation of the glacier), we do not need the information of spatial structure of the glacier
:param list_fn_stack: List of filenames of netCDF elevation time series
:param fn_shp: Filename of shapefile
:param feat_id: Feature name of interest
:param tlim: Time limits for integration
:param nproc: Number of cores used for multiprocessing [1]
:param outfile: Filename of output .csv
:returns:
"""
# get all rgiid intersecting stacks and the list of intersecting stacks
start = time.time()
all_rgiids, list_list_stacks = inters_feat_shp_stacks(fn_shp, list_fn_stack, feat_id)
# sort by rgiid group with same intersecting stacks (avoid loading + combining several time similar groups of stacks)
# !!! commenting to speed up things CPU wise for 100 m stacks, but need to uncomment this for higher resolution !!!
# all_rgiids = [rgiid for _, rgiid in sorted(zip(list_list_stacks,all_rgiids))]
# list_list_stacks = sorted(list_list_stacks)
list_tuples = list(zip(all_rgiids, list_list_stacks))
grouped = [(k, list(list(zip(*g))[0])) for k, g in groupby(list_tuples, itemgetter(1))]
print('Found '+str(len(all_rgiids))+' outlines intersecting stacks')
print('Grouped outlines in '+str(len(grouped))+' packs')
print('Elapsed: '+str(time.time()-start))
# loop through similar combination of stacks (that way, only have to combine/open them once)
df_final, df_hyp_final, df_int_final = (pd.DataFrame() for i in range(3))
if nproc == 1:
for i in range(len(grouped)):
list_fn_stack_pack = grouped[i][0]
rgiid_pack = grouped[i][1]
shp_params = (fn_shp,feat_id,rgiid_pack)
df, df_hyp, df_int = sel_int_hypsocheat((list_fn_stack_pack,shp_params,tlim,i))
df_final = df_final.append(df)
df_hyp_final = df_hyp_final.append(df_hyp)
df_int_final = df_int_final.append(df_int)
else:
argsin = [(grouped[i][0], (fn_shp, feat_id, grouped[i][1]), tlim, i) for i in range(len(grouped))]
pool = mp.Pool(nproc,maxtasksperchild=1)
outputs = pool.map(sel_int_hypsocheat,argsin,chunksize=1)
pool.close()
pool.join()
print('Finished processing, elapsed: ' + str(time.time() - start))
zips = list(zip(*outputs))
dfs = zips[0]
dfs_hyp = zips[1]
dfs_int = zips[2]
print('Finished zipping, elapsed: ' + str(time.time() - start))
df_final = pd.concat(dfs)
df_hyp_final = pd.concat(dfs_hyp)
df_int_final = pd.concat(dfs_int)
print('Finished putting in dataframes, elapsed: ' + str(time.time() - start))
fn_csv = os.path.join(os.path.dirname(outfile),os.path.splitext(os.path.basename(outfile))[0]+'_hyp2.csv')
fn_hyp_csv = os.path.join(os.path.dirname(outfile),os.path.splitext(os.path.basename(outfile))[0]+'_hyp.csv')
fn_int_csv = os.path.join(os.path.dirname(outfile),os.path.splitext(os.path.basename(outfile))[0]+'_int.csv')
df_final.to_csv(fn_csv,index=False)
df_hyp_final.to_csv(fn_hyp_csv,index=False)
df_int_final.to_csv(fn_int_csv,index=False)
[docs]def get_base_df_inventory(dir_shp,outfile):
"""
Create a "base" dataframe which contains characteristics of RGIIds features (this is done separately to avoid duplicating that info in GBs of data into elevation time series, as those are 2D DataFrames)
:param dir_shp: Directory containing RGI shapefiles
:param outfile: Output .csv file containing dataframe
"""
list_fn_shp = glob(os.path.join(dir_shp,'*/*.shp'),recursive=True)
df_world = pd.DataFrame()
for fn_shp in list_fn_shp:
print('Working on region:'+fn_shp)
ds_shp = gdal.OpenEx(fn_shp, gdal.OF_VECTOR)
layer = ds_shp.GetLayer()
list_rgiid, list_cl, list_nom, list_reg, list_term, list_area, list_lon, list_lat = ([] for i in range(8))
for feature in layer:
rgiid = feature.GetField('RGIId')
geom = feature.GetGeometryRef()
centroid_lon, centroid_lat, _ = geom.Centroid().GetPoint()
proj_in = osr.SpatialReference()
proj_in.ImportFromEPSG(4326)
_, area = vt.get_buffered_area_ratio(geom, proj_in.ExportToWkt(), 15.)
print('Glacier: '+rgiid)
if rgiid[0] != 'G':
reg = int(rgiid[6:8])
cl = feature.GetField('Connect')
nom = feature.GetField('Status')
term = feature.GetField('TermType')
else:
reg = 12
cl = 0
nom = 0
term = 0
list_rgiid.append(rgiid)
list_cl.append(cl)
list_nom.append(nom)
list_reg.append(reg)
list_term.append(term)
list_area.append(area)
list_lon.append(centroid_lon)
list_lat.append(centroid_lat)
df_reg = pd.DataFrame()
df_reg = df_reg.assign(rgiid=list_rgiid,cl=list_cl,nom=list_nom,reg=list_reg,term=list_term,area=list_area,lon=list_lon,lat=list_lat)
df_world = df_world.append(df_reg)
df_world.to_csv(outfile)
[docs]def add_base_to_int(df,fn_base,reg):
"""
Use "base" RGI file to check whether all glaciers are covered before aggregation, and to remove nominal glaciers and CL2 glacier estimates
:param df: DataFrame of integrated volume time series
:param fn_base: Filename of "base" RGI file
:param reg: RGI region number: 20 corresponds to Alaska and WNA combined, 21 corresponds to HMA combined (necessary for some uncertainty calculations as those are contiguous)
:returns: DataFrame of integrated volume time series with nominal, CL2 glaciers removed and not-covered glaciers added with nodata
"""
# if we can, help ourselves with pre-compiled data for all glaciers
df_base = pd.read_csv(fn_base)
# keep only glaciers from the region
if reg == 20:
ind = np.logical_or(df_base.reg == 1, df_base.reg == 2)
elif reg == 21:
ind = np.logical_or.reduce((df_base.reg == 13, df_base.reg == 14, df_base.reg == 15))
else:
ind = df_base.reg == reg
df_base_reg = df_base[ind]
# add glaciers completely omitted in this region (no data at all in the tile thus no stack: only concerns 2 tiles in Antarctica?)
proc_rgiid = list(set(list(df.rgiid)))
omit_rgiid = [rgiid for rgiid in list(df_base_reg.rgiid) if rgiid not in proc_rgiid]
print('Added ' + str(len(omit_rgiid)) + ' omitted glaciers.')
dates = list(set(list(df.time)))
for rgiid in omit_rgiid:
df_tmp = pd.DataFrame()
df_tmp = df_tmp.assign(area=[df_base_reg[df_base_reg.rgiid == rgiid].area.values[0]] * len(dates))
df_tmp['time'] = dates
df_tmp['rgiid'] = rgiid
df_tmp['dh'] = np.nan
df = df.append(df_tmp)
# remove nominal glaciers (irrelevant outlines: only North Asia, Scandinavia and Alps)
ind_nom = df_base_reg.nom == 2
nom_rgiid = list(df_base_reg.rgiid[ind_nom])
for rgiid in nom_rgiid:
df.loc[df.rgiid == rgiid,'dh'] = np.nan
print('Removed elevation change data for ' + str(len(nom_rgiid)) + ' nominal glaciers.')
# remove CL2 glaciers (Greenland)
ind_cl2 = df_base_reg.cl == 2
cl2_rgiid = list(df_base_reg.rgiid[ind_cl2])
for rgiid in cl2_rgiid:
df.loc[df.rgiid == rgiid,'area'] = np.nan
print('Removed contributing area for ' + str(len(cl2_rgiid)) + ' CL2 glaciers.')
return df
[docs]def aggregate_int_to_all(df,nproc=1,get_corr_err=True):
"""
Aggregate spatially per-glaciers volume changes time series and propagate errors
:param df: DataFrame of integrated volume time series
:param nproc: Number of cores used for multiprocessing [1]
:param get_corr_err: Derive correlated error (long): currently forced to annual only, because seasonal biases are not accounted for and can be complex
:returns: DataFrame of aggregated volume time series
"""
# get area of glacier without any data to account for it later
valid = np.logical_and.reduce((~np.isnan(df.dh), ~np.isnan(df.area),~np.isnan(df.err_dh)))
df_nodata = df[~valid]
rgiid_nodata = list(set(list(df_nodata.rgiid)))
area_nodata = 0
for rgiid in rgiid_nodata:
area_missed = df_nodata[df_nodata.rgiid == rgiid].area.values[0]
if ~np.isnan(area_missed):
area_nodata += area_missed
# keep only valid glaciers
print('Removed '+str(len(rgiid_nodata))+' non-valid glaciers with total area of '+str(area_nodata/1000000.)+' km².')
print(rgiid_nodata)
reg = df.reg.values[0]
df = df[valid]
# get the regional volume
df['dvol'] = df['dh'] * df['area']
df['var_cont'] = df['perc_err_cont'] * df['area']
df['area_res'] = df['area'] * df['perc_area_res']
df['area_meas'] = df['area'] * df['perc_area_meas']
df['area_valid_obs'] = df['area'] * df['valid_obs']
df['area_valid_obs_py'] = df['area'] * df['valid_obs_py']
df_tot = df.groupby('time')['dvol', 'area', 'var_cont', 'area_meas', 'area_res', 'area_valid_obs','area_valid_obs_py'].sum()
# propagate volume change accouting for area of no data glaciers
df_tot['dvol'] *= (df_tot['area'] + area_nodata) / df_tot['area']
df_tot['perc_err_cont'] = df_tot['var_cont'] / df_tot['area']
df_tot['valid_obs'] = df_tot['area_valid_obs'] / df_tot['area']
df_tot['valid_obs_py'] = df_tot['area_valid_obs_py'] / df_tot['area']
df_tot['area'] += area_nodata
df_tot['perc_area_meas'] = df_tot['area_meas'] / df_tot['area']
df_tot['perc_area_res'] = df_tot['area_res'] / df_tot['area']
df_tot['dh'] = df_tot['dvol'] / df_tot['area']
df_tot['area_nodata'] = area_nodata
df_tot['reg'] = reg
df_tot = df_tot.drop(columns=['var_cont', 'area_meas', 'area_res', 'area_valid_obs'])
# integrate elevation change error accouting for spatial correlation of temporal interpolation between glaciers
if np.count_nonzero(valid) == 0:
return df_tot
# get error only for annual
tlim = [np.datetime64(str(2000 + i) + '-01-01') for i in range(21)]
times = sorted(list(set(list(df['time']))))
df_time = pd.DataFrame()
df_time = df_time.assign(time=times)
df_time.index = pd.DatetimeIndex(pd.to_datetime(times))
closest_dates = []
for tl in tlim:
time_clos = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tl), method='nearest')][0]
closest_dates.append(time_clos)
# int_err = []
df_tot['err_dh'] = np.nan
if get_corr_err:
for time in closest_dates:
print('Time step: ' + time)
df_tmp = df[df.time == time]
corr_ranges = [150, 2000, 5000, 20000, 50000, 200000]
list_tuple_errs = list(
zip(*[df_tmp['err_corr_' + str(corr_ranges[i])].values for i in range(len(corr_ranges))]))
list_area_tot = df_tmp.area.values
list_lat = df_tmp.lat.values
list_lon = df_tmp.lon.values
err = spt.double_sum_covar(list_tuple_errs, corr_ranges, list_area_tot, list_lat, list_lon, nproc=nproc)
df_tot.loc[df_tot.index == time,'err_dh'] = err
# int_err = np.array(int_err)
# df_tot['err_dh'] = int_err
# error on general inventory, uncharted glaciers ; 3 percent here
# df_tot['err_area'] = 3./100.*df_tot['area']
# propagate error to volume change
# df_tot['err_dvol'] = np.sqrt((df_tot['err_dh']*df_tot['area'])**2 + (df_tot['dh']*np.sqrt(df_tot['err_area']**2+df_tot['err_cont']**2))**2)
df_tot['err_dvol'] = np.sqrt((df_tot['err_dh'] * df_tot['area']) ** 2 + (
df_tot['dh'] * df_tot['perc_err_cont'] / 100. * df_tot['area']) ** 2)
# convert to mass change (Huss, 2013)
df_tot['dm'] = df_tot['dvol'] * 0.85 / 10 ** 9
# propagate error to mass change (Huss, 2013)
df_tot['err_dm'] = np.sqrt((df_tot['err_dvol'] * 0.85 / 10 ** 9) ** 2 + (
df_tot['dvol'] * 0.06 / 10 ** 9) ** 2)
return df_tot
[docs]def df_int_to_base(infile,fn_base=None):
"""
Wrapper for adding base RGI info to DataFrame per glacier to all regions separately and write to .csv
:param infile: DataFrame of integrated volume time series
:param fn_base: Filename of "base" RGI file
:returns:
"""
df_init = pd.read_csv(infile)
print('Working on ' + infile)
region = int(os.path.basename(infile).split('_')[1])
# to integrate properly regions 1, 2, 13, 14, 15
if region == 1:
list_reg = [20, 1, 2]
elif region == 13:
list_reg = [21, 13, 14, 15]
else:
list_reg = [region]
for reg in list_reg:
print('Processing for region ' + str(reg))
if reg > 19:
fn_base_out = os.path.join(os.path.dirname(infile),
os.path.splitext(os.path.basename(infile))[0] + '_base.csv')
df = df_init
else:
fn_base_out = os.path.join(os.path.dirname(infile), 'dh_' + str(reg).zfill(2) + '_rgi60_int_base.csv')
if reg in [1, 2, 13, 14, 15]:
ind = np.array(['RGI60-' + str(reg).zfill(2) in rgiid for rgiid in list(df_init.rgiid.values)])
df = df_init[ind]
else:
df = df_init
df['reg'] = reg
if fn_base is not None:
df = add_base_to_int(df, fn_base, reg)
# if not os.path.exists(fn_base_out):
df.to_csv(fn_base_out)
[docs]def df_int_to_reg(infile,nproc=1):
"""
Wrapper for integrating volume change time series per region (based on RGI number) and write to .csv
:param infile: DataFrame of integrated volume time series
:param nproc: Number of cores used for multiprocessing [1]
:returns:
"""
df = pd.read_csv(infile)
fn_reg_out = os.path.join(os.path.dirname(infile),
os.path.splitext(os.path.basename(infile))[0] + '_reg.csv')
#let's go
df_tot = aggregate_int_to_all(df,nproc=nproc)
df_tot.to_csv(fn_reg_out)
[docs]def aggregate_all_to_period(df,list_tlim=None,mult_ann=None,fn_tarea=None,frac_area=None,list_tag=None):
"""
Temporal integration of volume change time series into rates for a specific period and error propagation
For regions, we can account for time-varying areas by approximated per-region annual areas (Zemp et al. (2019))
:param df: DataFrame of volume time series
:param list_tlim: List of tuples or early and late date to integrate over
:param mult_ann: Ignore tuples, integrate over all successive multi-annual periods of this length (e.g., four successive 5-year rates)
:param fn_tarea: Filename of DataFrame with temporally-varying areas
:param frac_area: Use only a corresponding fraction of the regional area to estimate time-varying areas (e.g., subset of Greenland, or other)
:param list_tag: Tag a naming to each integration period
:returns: DataFrame of volume change rates for all successive periods fitting the year length
"""
if np.count_nonzero(~np.isnan(df.dh))==0:
df_mult_ann = pd.DataFrame()
return df_mult_ann
if list_tlim is None and mult_ann is None:
print('Exiting: need to provide either a time interval (tlim) or a year length (mult_ann)')
if mult_ann is not None:
print('Multi-annual: ignoring list of time periods')
nb_period = int(np.floor(20 / mult_ann))
list_tlim = [(np.datetime64(str(2000 + mult_ann * i) + '-01-01'),np.datetime64(str(2000 + mult_ann * (i+1)) + '-01-01')) for i in range(nb_period)]
list_closest_dates = []
times = sorted(list(set(list(df['time']))))
df_time = pd.DataFrame()
df_time = df_time.assign(time=times)
df_time.index = pd.DatetimeIndex(pd.to_datetime(times))
for tl in list_tlim:
early_time_clos = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tl[0]), method='nearest')][0]
late_time_clos = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tl[1]), method='nearest')][0]
list_closest_dates.append((early_time_clos,late_time_clos))
reg = df.reg.values[0]
if fn_tarea is not None:
df_tarea = pd.read_csv(fn_tarea)
# get regionally evolving areas
if reg == 20:
tmp_tarea = df_tarea['RGI1'].values + df_tarea['RGI2'].values
elif reg == 21:
tmp_tarea = df_tarea['RGI13'].values + df_tarea['RGI14'].values + df_tarea['RGI15'].values
else:
tmp_tarea = df_tarea['RGI' + str(int(reg))].values
if frac_area is not None:
tmp_tarea = frac_area * tmp_tarea
#list corresponding to tuples of early and late date (to later derive mean area of two periods)
list_tarea = []
for i in range(len(list_tlim)):
# getting years 2000 to 2020
ind = df_tarea['YEAR'] == list_tlim[i][0].astype(object).year
early_tarea = tmp_tarea[ind][0] * 1000000
ind = df_tarea['YEAR'] == list_tlim[i][1].astype(object).year
late_tarea = tmp_tarea[ind][0] * 1000000
list_tarea.append((early_tarea,late_tarea))
else:
list_tarea =[(df.area.values[0],df.area.values[0]) for i in range(len(list_tlim))]
list_midtarea, list_dhdt, list_err_dhdt, list_dvoldt, list_err_dvoldt, list_dmdt, list_err_dmdt, list_valid_obs, list_dt, list_valid_obs_py, list_dmdtda, list_err_dmdtda = (
[] for i in range(12))
for i in range(len(list_tlim)):
# derive volume change for subperiod
tlim = list_tlim[i]
mult_ann = tlim[1].astype(object).year - tlim[0].astype(object).year
area = df.area.values[0]
dvol = (df[df.time == list_closest_dates[i][1]].dvol.values - df[df.time == list_closest_dates[i][0]].dvol.values)[0]
dh = dvol / area
err_dh = np.sqrt(
df[df.time == list_closest_dates[i][1]].err_dh.values[0] ** 2 + df[df.time == list_closest_dates[i][0]].err_dh.values[
0] ** 2)
err_dvol = np.sqrt((err_dh * area) ** 2 + (dh * df.perc_err_cont.values[0] / 100. * area) ** 2)
dvoldt = dvol / mult_ann
err_dvoldt = err_dvol / mult_ann
dmdt = dvol * 0.85 / 10 ** 9 / mult_ann
err_dmdt = np.sqrt((err_dvol * 0.85 / 10 ** 9) ** 2 + (
dvol * 0.06 / 10 ** 9) ** 2) / mult_ann
linear_area = (list_tarea[i][0] + list_tarea[i][1]) / 2
dhdt = dvol / linear_area / mult_ann
perc_err_linear_area = 1. / 100
err_dhdt = np.sqrt((err_dvol / linear_area) ** 2 \
+ (perc_err_linear_area * linear_area * dvol / linear_area ** 2) ** 2) / mult_ann
dmdtda = dmdt / linear_area * 10**9
err_dmdtda = np.sqrt((err_dmdt * 10**9 / linear_area) ** 2 \
+ (perc_err_linear_area * linear_area * dmdt * 10**9/ linear_area ** 2) ** 2)
ind = np.logical_and(df.time >= list_closest_dates[i][0], df.time < list_closest_dates[i][1])
valid_obs = np.nansum(df.valid_obs[ind].values)
valid_obs_py = np.nansum(df.valid_obs_py[ind].values)
list_midtarea.append(linear_area)
list_dhdt.append(dhdt)
list_err_dhdt.append(err_dhdt)
list_dvoldt.append(dvoldt)
list_err_dvoldt.append(err_dvoldt)
list_dmdt.append(dmdt)
list_err_dmdt.append(err_dmdt)
list_dmdtda.append(dmdtda)
list_err_dmdtda.append(err_dmdtda)
list_dt.append(str(tlim[0]) + '_' + str(tlim[1]))
list_valid_obs.append(valid_obs)
list_valid_obs_py.append(valid_obs_py)
df_mult_ann = pd.DataFrame()
df_mult_ann = df_mult_ann.assign(period=list_dt, tarea=list_midtarea, dhdt=list_dhdt, err_dhdt=list_err_dhdt,
dvoldt=list_dvoldt, err_dvoldt=list_err_dvoldt, dmdt=list_dmdt,
err_dmdt=list_err_dmdt, dmdtda=list_dmdtda,err_dmdtda=list_err_dmdtda,valid_obs=list_valid_obs, valid_obs_py=list_valid_obs_py)
if list_tag is not None:
df_mult_ann['tag'] = list_tag
df_mult_ann['reg'] = reg
df_mult_ann['perc_area_meas'] = df.perc_area_meas.values[0]
df_mult_ann['perc_area_res'] = df.perc_area_res.values[0]
df_mult_ann['area_nodata'] = df.area_nodata.values[0]
df_mult_ann['area'] = df.area.values[0]
return df_mult_ann
[docs]def aggregate_indep_regions(df_p):
"""
Aggregate regions with independent assumptions for uncertainty propagation
:param df_p:
:return:
"""
# GLOBAL TOTAL
area_global = np.nansum(df_p.area.values)
area_nodata_global = np.nansum(df_p.area_nodata.values)
tarea_global = np.nansum(df_p.tarea.values)
dmdt_global = np.nansum(df_p.dmdt.values)
err_dmdt_global = np.sqrt(np.nansum(df_p.err_dmdt.values ** 2))
dvoldt_global = np.nansum(df_p.dvoldt.values)
err_dvoldt_global = np.sqrt(np.nansum(df_p.err_dvoldt.values ** 2))
err_tarea_global = np.sqrt(np.nansum((1 / 100. * df_p.tarea.values) ** 2))
dhdt_global = np.nansum(df_p.dvoldt.values) / tarea_global
err_dhdt_global = np.sqrt(
(err_dvoldt_global / tarea_global) ** 2 + (err_tarea_global * dvoldt_global / (tarea_global ** 2)) ** 2)
dmdtda_global = dmdt_global * 10**9/tarea_global
err_dmdtda_global = np.sqrt(
(err_dmdt_global *10**9/ tarea_global) ** 2 + (err_tarea_global * dmdt_global*10**9 / (tarea_global ** 2)) ** 2)
perc_area_res_global = np.nansum(df_p.perc_area_res.values * df_p.area.values) / np.nansum(
df_p.perc_area_res.values)
perc_area_meas_global = np.nansum(df_p.perc_area_meas.values * df_p.area.values) / np.nansum(
df_p.perc_area_meas.values)
valid_obs_global = np.nansum(df_p.valid_obs.values * df_p.area.values) / np.nansum(df_p.perc_area_res.values)
valid_obs_py_global = np.nansum(df_p.valid_obs_py.values * df_p.area.values) / np.nansum(df_p.perc_area_res.values)
df_sum = pd.DataFrame()
df_sum = df_sum.assign(area=[area_global],area_nodata=[area_nodata_global],dhdt=[dhdt_global],err_dhdt=[err_dhdt_global],dvoldt=[dvoldt_global],err_dvoldt=[err_dvoldt_global]
,dmdt=[dmdt_global],err_dmdt=[err_dmdt_global],dmdtda=[dmdtda_global],err_dmdtda=[err_dmdtda_global],tarea=[tarea_global],perc_area_res=[perc_area_res_global],perc_area_meas=[perc_area_meas_global],valid_obs=[valid_obs_global]
,valid_obs_py=[valid_obs_py_global])
return df_sum
[docs]def df_region_to_multann(infile_reg, fn_tarea=None, frac_area=None):
"""
Wrapper for temporal integration of regional volume change time series for all possible 1-,2-,4-,5-,10- and 20-year periods and write to .csv
:param infile_reg: DataFrame of regional, integrated volume time series
:param fn_tarea: Filename of DataFrame with temporally-varying areas
:param frac_area: Use only a corresponding fraction of the regional area to estimate time-varying areas (e.g., subset of Greenland, or other)
:returns:
"""
df = pd.read_csv(infile_reg)
fn_out = os.path.join(os.path.dirname(infile_reg),
os.path.splitext(os.path.basename(infile_reg))[0] + '_subperiods.csv')
list_df = []
for mult_ann in [1,2,4,5,10,20]:
df_mult_ann = aggregate_all_to_period(df,mult_ann=mult_ann,fn_tarea=fn_tarea,frac_area=frac_area)
list_df.append(df_mult_ann)
df_final = pd.concat(list_df)
df_final.to_csv(fn_out)
def wrapper_tile_rgiids(argsin):
list_tile, df_base, uniq_rgiids, i, itot = argsin
print('Working on pack: '+str(i)+' out of '+str(itot))
tmp_rgiids = []
for j, tile in enumerate(list_tile):
print('Tile: '+str(j)+' out of '+str(len(list_tile)))
ind_base = np.logical_and.reduce((df_base.lat >= tile[1], df_base.lat < tile[3], df_base.lon >= tile[0], df_base.lon < tile[2]))
rgiids_base = list(df_base[ind_base].rgiid)
if len(rgiids_base)==0:
tmp_rgiids.append(['NA'])
print('No base glacier in this tile, skipping...')
continue
#this is only necessary is the few void glaciers have not been added to the dataframe post volume integ
rgiids = [rgiid for rgiid in rgiids_base if rgiid in uniq_rgiids]
if len(rgiids) == 0:
tmp_rgiids.append(['NA'])
print('No glacier in this tile, skipping...')
continue
tmp_rgiids.append(rgiids)
return tmp_rgiids
def wrapper_tile_int_to_all_to_mult_ann(argsin):
df_tile, tile, i, imax = argsin
# df_tile = df_keep[df_keep.rgiid.isin(rgiids_final)]
print('Working on tile '+str(tile)+': '+str(i)+' out of '+str(imax))
df_agg = aggregate_int_to_all(df_tile, nproc=1)
df_agg['time'] = df_agg.index.values
list_df_mult = []
for mult_ann in [1, 2, 4, 5, 10, 20]:
#TODO: for specific rates at smaller scales than region, we don't assume time-varying areas yet... somewhat inconsistent
df_mult = aggregate_all_to_period(df_agg, mult_ann=mult_ann)
list_df_mult.append(df_mult)
df_mult_all = pd.concat(list_df_mult)
df_mult_all['tile_lonmin'] = tile[0]
df_mult_all['tile_latmin'] = tile[1]
return df_mult_all
[docs]def df_all_base_to_tile(list_fn_int_base,fn_base,tile_size=1,nproc=1,sort_tw=True):
"""
Integrate all per-glacier volume change time series on a global tiling of certain size, for all possible 1-,2-,4-,5-,10- and 20-year periods, with error propagation
:param list_fn_int_base: List of filename of DataFrame of per-glacier volume change time series with base RGIID info added
:param fn_base: Filename of "base" RGI file
:param tile_size: Size of tiling in lat/lon degrees
:param nproc: Number of cores used for multiprocessing [1]
:param sort_tw: Sort between marine-terminating and land-terminating glaciers
:returns:
"""
fn_out = os.path.join(os.path.dirname(list_fn_int_base[0]), 'dh_world_tiles_' + str(tile_size) + 'deg.csv')
def world_latlon_tiles(deg_size):
list_tile = []
for ymin in np.arange(-90, 90, deg_size):
for xmin in np.arange(-180, 180, deg_size):
list_tile.append(np.array([xmin, ymin, xmin + deg_size, ymin + deg_size], dtype=float))
return list_tile
list_tile = world_latlon_tiles(tile_size)
# list_tile = [np.array([-180,70,-179,71],dtype=float),np.array([-18,64,-17,65],dtype=float),np.array([-17,64,-16,65],dtype=float)]
df_base = pd.read_csv(fn_base)
if sort_tw:
ind_tw = df_base.term == 1
keep_tw = list(df_base.rgiid[ind_tw])
keep_ntw = list(df_base.rgiid[~ind_tw])
list_keeps = [keep_tw,keep_ntw,list(df_base.rgiid)]
name_keeps = ['tw','ntw','all']
else:
list_keeps=[list(df_base.rgiid)]
name_keeps=['all']
list_df_final = []
# list_dfs = []
for fn_int_base in list_fn_int_base:
df = pd.read_csv(fn_int_base)
# list_dfs.append(pd.read_csv(fn_int_base))
# df = pd.concat(list_dfs)
for keep in list_keeps:
print('Processing for category: '+name_keeps[list_keeps.index(keep)])
df_keep=df[df.rgiid.isin(keep)]
if len(df_keep)==0:
continue
r = int(os.path.basename(fn_int_base).split('_')[1])
if r == 1:
list_reg = [1,2]
elif r == 13:
list_reg = [13,14,15]
else:
list_reg = [r]
list_ext_reg = []
# list_reg = list(set(list(df_base.reg)))
print('Deriving region extents for tile omission...')
for reg in list_reg:
print('Region: '+str(reg))
df_tmp = df_base[df_base.reg==reg]
list_ext_reg.append(np.array([df_tmp.lon.min(),df_tmp.lat.min(),df_tmp.lon.max(),df_tmp.lat.max()],dtype=float))
uniq_rgiids = list(set(list(df_keep.rgiid)))
print('Removing useless tiles')
list_tile_possib = []
for tile in list_tile:
# to speed things up, let's remove all tiles outside region extents
poly_tile = vt.poly_from_extent(tile)
chk = 0
for reg in list_reg:
poly = vt.poly_from_extent(list_ext_reg[list_reg.index(reg)])
if poly.Intersects(poly_tile):
chk = 1
if chk != 0:
list_tile_possib.append(tile)
print('Finding which tiles have to be processed...')
if nproc==1:
print('Using 1 core...')
list_rgiids = []
for i in range(len(list_tile_possib)):
tmp_rgiids = wrapper_tile_rgiids((list_tile_possib[i], df_base, uniq_rgiids, i, len(list_tile_possib)))
list_rgiids.append(tmp_rgiids)
else:
print('Using '+str(nproc)+' cores...')
pack_size = int(np.ceil(len(list_tile_possib) / nproc))
argsin = [(list_tile_possib[i:min(i+pack_size,len(list_tile_possib))], df_base, uniq_rgiids, k, nproc) for k ,i in enumerate(np.arange(0,len(list_tile_possib),pack_size))]
pool = mp.Pool(nproc, maxtasksperchild=1)
outputs = pool.map(wrapper_tile_rgiids, argsin, chunksize=1)
pool.close()
pool.join()
list_rgiids = list(itertools.chain(*outputs))
list_df_tile = []
list_tile_final = []
# list_rgiids_final = []
for i, rgiids in enumerate(list_rgiids):
if len(rgiids)==1 and rgiids[0]=='NA':
continue
# list_rgiids_final.append(rgiids)
df_tile = df_keep[df_keep.rgiid.isin(rgiids)]
list_df_tile.append(df_tile)
list_tile_final.append(list_tile_possib[i])
print('Found '+str(len(list_tile_final))+' valid tiles to integrate.')
print('Integrating...')
if nproc == 1:
print('Using 1 core...')
list_mult_all = []
for i, tile_final in enumerate(list_tile_final):
df_mult_all = wrapper_tile_int_to_all_to_mult_ann((list_df_tile[i],tile_final,i,len(list_tile_final)))
list_mult_all.append(df_mult_all)
else:
print('Using '+str(nproc)+' cores...')
argsin = [(list_df_tile[i],list_tile_final[i],i,len(list_tile_final)) for i in range(len(list_tile_final))]
pool = mp.Pool(nproc, maxtasksperchild=1)
outputs = pool.map(wrapper_tile_int_to_all_to_mult_ann, argsin, chunksize=1)
pool.close()
pool.join()
list_mult_all = outputs
if len(list_mult_all)>0:
df_tiles = pd.concat(list_mult_all)
df_tiles['tile_size'] = tile_size
df_tiles['category'] = name_keeps[list_keeps.index(keep)]
df_tiles['reg'] = os.path.basename(fn_int_base).split('_')[1]
list_df_final.append(df_tiles)
df_cat = pd.concat(list_df_final)
df_cat.to_csv(fn_out)
[docs]def aggregate_df_int_time(infile,tlim=None,rate=False):
"""
Temporal integration over any period, choice between cumulative or rate
:param infile: Filename of DataFrame with volume change time series
:param tlim: Time limits for integration
:param rate: Boolean, add rates
:returns:
"""
df = pd.read_csv(infile)
#not the cleanest closest time search you can write, but works
times = sorted(list(set(list(df['time']))))
df_time = pd.DataFrame()
df_time = df_time.assign(time=times)
df_time.index = pd.DatetimeIndex(pd.to_datetime(times))
time_start = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tlim[0]),method='nearest')][0]
time_end = df_time.iloc[df_time.index.get_loc(pd.to_datetime(tlim[1]),method='nearest')][0]
df = df.sort_values(by='rgiid')
df_start = df[df.time == time_start]
df_end = df[df.time == time_end]
ind = np.logical_and(df.time >= time_start, df.time < time_end)
df_period = df[ind]
df_period_cum = df_period.groupby('rgiid')['valid_obs',].sum()
df_gla = pd.DataFrame()
df_gla['rgiid'] = df_start['rgiid']
df_gla['valid_obs'] = df_period_cum.valid_obs.values
df_gla['period'] = str(time_start)+'_'+str(time_end)
df_gla['area'] = df_start['area']
df_gla['perc_area_meas'] = df_start['perc_area_meas']
df_gla['lat'] = df_start['lat']
df_gla['lon'] = df_start['lon']
df_gla['dh'] = df_end['dh'].values - df_start['dh'].values
df_gla['err_dh'] = np.sqrt(df_end['err_dh'].values**2+df_start['err_dh'].values**2)
df_gla['err_cont'] = df_start['perc_err_cont'] * df_start['area'] / 100.
df_gla['perc_err_cont']=df_start['perc_err_cont']
#get the volume per glacier
df_gla['dvol'] = df_gla['dh'] * df_gla['area']
#correct systematic seasonal biases (snow-covered terrain)
# df_tot['dh'] =
#propagate error to volume change
df_gla['err_dvol'] = np.sqrt((df_gla['err_dh'] * df_gla['area']) ** 2 + (
df_gla['dh'] * df_gla['perc_err_cont'] / 100. * df_gla['area']) ** 2)
#convert to mass change (Huss, 2013)
df_gla['dm'] = df_gla['dvol'] * 0.85 / 10 ** 9
#propagate error to mass change (Huss, 2013)
df_gla['err_dm'] = np.sqrt((df_gla['err_dvol'] * 0.85 / 10 ** 9) ** 2 + (
df_gla['dvol'] * 0.06 / 10 ** 9) ** 2)
df_gla['dmda'] = df_gla['dm'].values / df_gla['area'].values * 10 ** 9
df_gla['err_dmda'] = np.sqrt((df_gla['err_dm'].values * 10 ** 9 / df_gla['area'].values) ** 2 \
+ (df_gla['err_cont'].values * df_gla['dm'].values * 10 ** 9 / df_gla['area'].values ** 2) ** 2)
if rate:
dt = (tlim[1]-tlim[0]).astype(int)/365.2524
df_gla['dmdtda'] = df_gla['dmda'] / dt
df_gla['err_dmdtda'] = df_gla['err_dmda'] /dt
df_gla['dhdt'] = df_gla['dh']/dt
df_gla['err_dhdt'] = df_gla['err_dh'] /dt
df_gla = df_gla.drop(columns=['dvol', 'err_dvol', 'dm', 'err_dm','dh','err_dh','dmda','err_dmda'])
else:
df_gla = df_gla.drop(columns=['dvol', 'err_dvol', 'dm', 'err_dm','dh','err_dh'])
df_gla['area'] = df_gla['area']/1000000
df_gla['reg'] = df_start['reg']
return df_gla
def aggregate_int_to_shp(df,fn_shp,list_tlim=None,mult_ann=None,field_name='FULL_NAME',code_name='RGI_CODE',sort_tw=False,fn_base=None,fn_tarea=None):
ds_shp_in = ogr.GetDriverByName('ESRI Shapefile').Open(fn_shp, 0)
layer_in = ds_shp_in.GetLayer()
if fn_base is not None:
df_base = pd.read_csv(fn_base)
list_df_all = []
for feature in layer_in:
geom = feature.GetGeometryRef()
region = feature.GetField(field_name)
rgi_code = feature.GetField(code_name)
centroid = geom.Centroid()
center_lon, center_lat, _ = centroid.GetPoint()
# list_rgiid = list(set(list(df.rgiid)))
# keep only points in polygon
df_rgiid = df.groupby('rgiid')['lat', 'lon'].mean()
df_rgiid['rgiid'] = df_rgiid.index
list_subreg_rgiid = []
for i in np.arange(len(df_rgiid)):
print('Working on glacier ' + str(i + 1) + ' out of ' + str(len(df_rgiid)))
lat = df_rgiid.lat.values[i]
lon = df_rgiid.lon.values[i]
point = ogr.Geometry(ogr.wkbPoint)
point.AddPoint(lon, lat)
if point.Intersects(geom):
list_subreg_rgiid.append(df_rgiid.rgiid.values[i])
if sort_tw:
ind_tw = np.logical_or(df_base.term == 1, df_base.term == 5)
keep_tw = list(df_base.rgiid[ind_tw])
keep_ntw = list(df_base.rgiid[~ind_tw])
list_keeps = [keep_tw, keep_ntw, list(df_base.rgiid)]
name_keeps = ['tw', 'ntw', 'all']
else:
list_keeps = [list_subreg_rgiid]
name_keeps = ['all']
for keeps in list_keeps:
subset = [rgiid for rgiid in list_subreg_rgiid if rgiid in keeps]
df_subreg_int = df[df.rgiid.isin(subset)]
if len(df_subreg_int) == 0:
continue
df_subreg_reg = aggregate_int_to_all(df_subreg_int, get_corr_err=True)
df_subreg_reg['time'] = df_subreg_reg.index.values
if len(df_subreg_reg) == 0:
continue
subreg_area = df_subreg_reg.area.values[0]
# frac_area = subreg_area / regional_area
frac_area = None
list_df_mult = []
for mult_ann in [1, 2, 4, 5, 10, 20]:
df_mult = aggregate_all_to_period(df_subreg_reg, list_tlim=list_tlim, mult_ann=mult_ann, fn_tarea=fn_tarea,
frac_area=frac_area)
list_df_mult.append(df_mult)
df_mult_all = pd.concat(list_df_mult)
df_mult_all['subreg'] = region
df_mult_all['lon_reg'] = center_lon
df_mult_all['lat_reg'] = center_lat
df_mult_all['category'] = name_keeps[list_keeps.index(keeps)]
df_mult_all['frac_area'] = frac_area
list_df_all.append(df_mult_all)
df_in_shp = pd.concat(list_df_all)
return df_in_shp