Ocean Color Science Software
ocssw V2022
utils.py
from scipy.spatial.distance import pdist, squareform

import pandas as pd
import numpy as np


def get_unique_features(features, threshold=0.025, chunksize=10, _bottom=False):
    ''' Mask out features which are near duplicates of some other
        feature(s), based on correlation. The threshold parameter
        determines how similar two features can be before one of them
        is dropped. Roughly based on https://stackoverflow.com/a/66326102
    '''
    # To make this more efficient, break into chunks and filter those chunks first
    if (features.shape[1] > chunksize) and not _bottom:
        '''
        There are two options, which boil down to a question of how information
        propagates between chunks:
        1. Binary-split recursion down to the chunksize. This will filter
           features more towards the bottom of the tree, assuming it's
           more likely for similar features to be near each other.

        2. Single-level tree, with (nearly) all nodes having the same number
           of features (i.e. chunksize). This performs most filtering at the
           root of the tree, when all nodes are combined after the initial
           filter.

        Paradoxically, option 1 appears to result in a higher number of final
        features returned, in spite of applying more filters. This could be a
        consequence of chained features, e.g. features A and B have a distance
        of 0.015, B and C have a distance of 0.015, and A and C have 0.03; with
        a threshold of 0.025, B and C would both be removed if all three were
        within the same filter. If instead A and B were in one filter (with B
        then removed), but C was in another, then the final result would be
        A and C rather than just A.

        These chains (referred to as "Hamming chains" in the stackoverflow post)
        can be arbitrarily long, and breaking them should intuitively offer
        a better representation for ML model features. Option 1 therefore seems
        to be the better choice, as well as using the smallest chunksize
        possible - but note that this hasn't been rigorously tested.
        '''
        recursion_size = features.shape[-1] // 2  # Option 1 - binary recursion
        # recursion_size = chunksize              # Option 2 - flattened recursion

        # Filter each chunk independently, then combine the surviving features
        # and filter them once more at the bottom level
        mask = np.concatenate([
            get_unique_features(features[:, i: i+recursion_size], threshold, chunksize)
            for i in range(0, features.shape[1], recursion_size)
        ])
        mask[mask] = get_unique_features(features[:, mask], threshold, chunksize, True)
        return mask

    # Pairwise correlation distance (1 - Pearson r) between all features
    dist = squareform( pdist(features.T, metric='correlation') )

    # Only consider the lower triangle, so a feature is dropped exactly when
    # it is within the threshold of some earlier feature
    dist[np.triu_indices_from(dist)] = -1
    mask = (0 <= dist) & (dist <= threshold)
    return ~np.any(mask, axis=1)


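# A minimal usage sketch of get_unique_features (the demo helper below is
# hypothetical and not part of ocssw). Column 1 is a near-copy of column 0,
# so it should be masked out, while the independent column 2 survives.
def _demo_get_unique_features():
    rng   = np.random.default_rng(0)
    base  = rng.normal(size=(100, 1))
    noisy = base + rng.normal(scale=1e-4, size=(100, 1))  # near-duplicate of base
    other = rng.normal(size=(100, 1))                     # unrelated feature

    keep = get_unique_features(np.hstack([base, noisy, other]))
    return keep  # expected: array([ True, False,  True])

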
def spearmanr(x, y):
    ''' Scipy's provided spearmanr is too slow for large matrices, so we
        instead use a custom implementation.
        Source: https://stackoverflow.com/questions/52371329/fast-spearman-correlation-between-two-pandas-dataframes/59072032#59072032
    '''
    def rankdata_average(data):
        ''' Row-based rankdata using method='average' '''
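        # For example (hypothetical values), tied entries share the mean of
        # their ordinal ranks:
        #   rankdata_average([[30, 10, 10, 20]]) -> [[4.0, 1.5, 1.5, 3.0]]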
        dc     = np.asarray(data).copy()
        sorter = np.apply_along_axis(np.argsort, 1, data)

        # Inverse of the sort permutation, i.e. the ordinal rank of each element
        inv   = np.empty(data.shape, np.intp)
        ranks = np.tile(np.arange(data.shape[1]), (len(data), 1))
        np.put_along_axis(inv, sorter, ranks, axis=1)

        # obs marks where a new run of equal values starts in each sorted row;
        # its cumulative sum, mapped back through inv, gives dense ranks
        dc  = np.take_along_axis(dc, sorter, 1)
        res = np.apply_along_axis(lambda r: r[1:] != r[:-1], 1, dc)
        obs = np.column_stack([np.ones(len(res), dtype=bool), res])

        dense = np.take_along_axis(np.apply_along_axis(np.cumsum, 1, obs), inv, 1)
        len_r = obs.shape[1]

        # Group rows by their number of distinct values, so that each group
        # yields a rectangular array of run-start indices
        nonzero = np.count_nonzero(obs, axis=1)
        nonzero = pd.Series(nonzero)
        dense   = pd.DataFrame(dense)
        obs     = pd.DataFrame(obs)

        ranks = []
        for _nonzero, nzdf in obs.groupby(nonzero, sort=False):
            nz = np.apply_along_axis(lambda r: np.nonzero(r)[0], 1, nzdf)

            _count = np.column_stack([nz, np.ones(len(nz)) * len_r])
            _dense = dense.reindex(nzdf.index).values

            # Average rank of a run of ties = midpoint of its start and end
            # boundary positions (1-based)
            _result = 0.5 * (np.take_along_axis(_count, _dense, 1) + np.take_along_axis(_count, _dense - 1, 1) + 1)
            result  = pd.DataFrame(_result, index=nzdf.index)
            ranks.append(result)
        return pd.concat(ranks).sort_index()

    def compute_corr(x, y):
        ''' Row-wise Pearson correlation between x and y '''
        # Thanks to https://github.com/dengemann
        def ss(a, axis):
            return np.sum(a * a, axis=axis)

        x = np.asarray(x)
        y = np.asarray(y)

        # Center each row, then r = <xm, ym> / (||xm|| * ||ym||)
        mx = x.mean(axis=-1)
        my = y.mean(axis=-1)
        xm, ym = x - mx[..., None], y - my[..., None]

        r_num = np.add.reduce(xm * ym, axis=-1)
        r_den = np.sqrt(ss(xm, axis=-1) * ss(ym, axis=-1))

        with np.errstate(divide='ignore', invalid='ignore'):
            r = r_num / r_den
        return r


    x = np.asarray(x)
    y = np.asarray(y)

    # Spearman's rho is the Pearson correlation of the rank-transformed rows
    rx = rankdata_average(x)
    ry = rankdata_average(y)
    return compute_corr(rx, ry)
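

# A minimal usage sketch of spearmanr (the demo helper below is hypothetical
# and not part of ocssw): row-wise Spearman correlation of two small matrices.
# Each output element should match scipy.stats.spearmanr applied to the
# corresponding pair of rows.
def _demo_spearmanr():
    x = np.array([[1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
    y = np.array([[2., 4., 6., 8.],    # monotone increasing -> rho =  1
                  [8., 6., 4., 2.]])   # monotone decreasing -> rho = -1
    return np.asarray(spearmanr(x, y))  # expected: array([ 1., -1.])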