__all__ = ['extract', 'run_cli']
import argparse
import os
import sys
from collections.abc import Sequence
import traceback
import warnings
import logging
import networkx as nx
import numpy as np
from os.path import join as pjoin, exists as pexists
from sys import version_info
if version_info.major > 2:
from hiwenet import more_metrics, non_pairwise
from hiwenet.utils import compute_histogram, HiwenetWarning
else:
raise NotImplementedError('hiwenet supports only Python 3 or higher!')
list_medpy_histogram_metrics = np.array([
'chebyshev', 'chebyshev_neg', 'chi_square',
'correlate', 'correlate_1',
'cosine', 'cosine_1', 'cosine_2', 'cosine_alt',
'euclidean', 'fidelity_based',
'histogram_intersection', 'histogram_intersection_1',
'jensen_shannon', 'kullback_leibler', 'manhattan', 'minowski',
'noelle_1', 'noelle_2', 'noelle_3', 'noelle_4', 'noelle_5',
'relative_bin_deviation', 'relative_deviation'])
metric_list = [
'manhattan', 'minowski', 'euclidean',
'noelle_2', 'noelle_4', 'noelle_5']
unknown_prop_list = ['histogram_intersection']
still_under_dev = ['quadratic_forms']
similarity_func = ['kullback_leibler', 'cosine_1', 'correlate',
'cosine', 'cosine_2', 'cosine_alt', 'fidelity_based']
semi_metric_list = [
'jensen_shannon', 'chi_square',
'chebyshev', 'chebyshev_neg',
'histogram_intersection_1',
'relative_deviation', 'relative_bin_deviation',
'noelle_1', 'noelle_3',
'correlate_1']
metrics_on_original_features = ['diff_medians', 'diff_medians_abs',
'diff_means', 'diff_means_abs',
'exp_diff_means_norm_std', 'ranksum_statistic']
list_hiwenet_histogram_metrics = np.array(['sorensen_dist', ])
symmetric_metrics_on_original_features = ['diff_medians_abs', 'diff_means_abs']
minimum_num_bins = 5
default_weight_method = 'manhattan'
default_num_bins = 25
default_edge_range = None
default_trim_percentile = 5
default_trim_behaviour = True
default_return_networkx_graph = False
default_out_weights_path = None
def compute_bin_edges(features, num_bins, edge_range, trim_outliers, trim_percentile,
use_orig_distr=False):
"Compute the edges for the histogram bins to keep it the same for all nodes."
if use_orig_distr:
print('Using original distribution (without histogram) to compute edge weights!')
edges=None
return edges
if edge_range is None:
if trim_outliers:
# percentiles_to_keep = [ trim_percentile, 1.0-trim_percentile] # [0.05, 0.95]
edges_of_edges = np.array([np.percentile(features, trim_percentile),
np.percentile(features, 100 - trim_percentile)])
else:
edges_of_edges = np.array([np.min(features), np.max(features)])
else:
edges_of_edges = edge_range
# Edges computed using data from all nodes, in order to establish correspondence
edges = np.linspace(edges_of_edges[0], edges_of_edges[1], num=num_bins, endpoint=True)
return edges
def pairwise_extract(features, groups, edges, weight_func, use_orig_distr,
group_ids, num_groups, num_links,
non_symmetric, return_networkx_graph, out_weights_path):
"""
Core function to compute the pair-wise histogram distance between all ROIs.
Parameters
----------
features
groups
edges
weight_func
use_orig_distr
group_ids
num_groups
num_links
non_symmetric
return_networkx_graph
out_weights_path
Returns
-------
result : object
A networkx graph or numpy array depending on request
"""
# the following will execute only when the pair-wise computation is requested.
if return_networkx_graph:
graph = nx.DiGraph() if non_symmetric else nx.Graph()
graph.add_nodes_from(group_ids)
else:
edge_weights = np.full([num_groups, num_groups], np.nan)
exceptions_list = list()
for src in range(num_groups):
# primitive progress indicator
if np.mod(src + 1, 5) == 0.0:
sys.stdout.write('.')
index1 = groups == group_ids[src]
hist_one = compute_histogram(features[index1], edges, use_orig_distr)
if non_symmetric:
target_list = range(num_groups)
else:
# when symmetric, only upper tri matrix is computed/filled
target_list = range(src + 1, num_groups, 1)
for dest in target_list:
# skipping edge between self
if src == dest:
continue
index2 = groups == group_ids[dest]
hist_two = compute_histogram(features[index2], edges, use_orig_distr)
try:
edge_value = weight_func(hist_one, hist_two)
if return_networkx_graph:
graph.add_edge(group_ids[src], group_ids[dest],
weight=float(edge_value))
else:
edge_weights[src, dest] = edge_value
except (RuntimeError, RuntimeWarning) as runexc:
# placeholder to ignore some runtime errors (such as medpy's logger issue)
print(runexc)
except BaseException as exc:
# numerical instabilities can cause trouble for histogram distance calculations
traceback.print_exc()
exceptions_list.append(str(exc))
logging.warning('Unable to compute edge weight between '
' {} and {}. Skipping it.'
''.format(group_ids[src], group_ids[dest]))
error_thresh = 0.05
if len(exceptions_list) >= error_thresh * num_links:
print('All exceptions encountered so far:\n'
' {}'.format('\n'.join(exceptions_list)))
raise ValueError('Weights for atleast {:.2f}% of edges could not be computed.'
''.format(error_thresh * 100))
sys.stdout.write('\n')
if return_networkx_graph:
if out_weights_path is not None:
graph.write_graphml(out_weights_path)
return graph
else:
if out_weights_path is not None:
np.savetxt(out_weights_path, edge_weights, delimiter=',', fmt='%.9f')
return edge_weights
def identify_groups(groups):
"""
To compute number of unique elements in a given membership specification.
Parameters
----------
groups : numpy 1d array of length p,
each value specifying which group that particular node belongs to.
For examlpe, if you have a cortical thickness values for 1000 vertices
belonging to 100 patches, this array could have numbers 1 to 100
specifying which vertex belongs to which cortical patch. Although grouping
with numerical values (contiguous from 1 to num_patches) is strongly
recommended for simplicity, this could also be a list of strings of length p.
Returns
-------
group_ids : numpy array of values identifying the unique groups specified
num_groups : scalar value denoting the number of unique groups specified
"""
group_ids = np.unique(groups)
num_groups = len(group_ids)
if num_groups < 2:
raise ValueError('There must be atleast two nodes or groups in data, '
'for pair-wise edge-weight calculations.')
return group_ids, num_groups
def check_param_ranges(num_bins, num_groups, num_values, trim_outliers, trim_percentile):
"""Ensuring the parameters are in valid ranges."""
if num_bins < minimum_num_bins:
raise ValueError('Too few bins! The number of bins must be >= 5')
if num_values < num_groups:
raise ValueError('Insufficient number of values '
'in features (< number of nodes), or invalid membership!')
if trim_outliers:
if trim_percentile < 0 or trim_percentile >= 100:
raise ValueError('percentile of tail values to trim '
'must be in the semi-open interval [0,1).')
elif num_values < 2:
raise ValueError('too few features to compute minimum and maximum')
return
def type_cast_params(num_bins, edge_range_spec, features, groups):
"""Casting inputs to required types."""
if isinstance(num_bins, str):
# possible when called from CLI
num_bins = np.float(num_bins)
if np.isnan(num_bins) or np.isinf(num_bins):
raise ValueError('Invalid value for number of bins!'
' Choose a natural/finite number >= {}'
''.format(minimum_num_bins))
# rounding it to ensure it is int
num_bins = int(num_bins)
if edge_range_spec is None:
edge_range = edge_range_spec
elif isinstance(edge_range_spec, Sequence):
if len(edge_range_spec) != 2:
raise ValueError('edge_range must be a tuple of two values: (min, max)')
if edge_range_spec[0] >= edge_range_spec[1]:
raise ValueError('edge_range : min {} is not less than the max {} !'
''.format(edge_range_spec[0], edge_range_spec[1]))
if not np.all(np.isfinite(edge_range_spec)):
raise ValueError('Infinite or NaN values in edge range : {}'
''.format(edge_range_spec))
# converting it to tuple to make it immutable
edge_range = tuple(edge_range_spec)
else:
raise ValueError('Invalid edge range!'
' Must be a tuple of two values (min, max)')
if not isinstance(features, np.ndarray):
features = np.array(features)
if not isinstance(groups, np.ndarray):
groups = np.array(groups)
return num_bins, edge_range, features, groups
def make_random_histogram(length=100, num_bins=10):
"Returns a sequence of histogram density values that sum to 1.0"
hist, bin_edges = np.histogram(np.random.random(length),
bins=num_bins, density=True)
# to ensure they sum to 1.0
hist = hist / sum(hist)
if len(hist) < 2:
raise ValueError('Invalid histogram')
return hist
def check_weight_method(weight_method_spec,
use_orig_distr=False,
allow_non_symmetric=False):
"Check if weight_method is recognized and implemented, or ensure it is callable."
if not isinstance(use_orig_distr, bool):
raise TypeError('use_original_distribution flag must be boolean!')
if not isinstance(allow_non_symmetric, bool):
raise TypeError('allow_non_symmetric flag must be boolean')
if isinstance(weight_method_spec, str):
weight_method_spec = weight_method_spec.lower()
if weight_method_spec in list_medpy_histogram_metrics:
from medpy.metric import histogram as medpy_hist_metrics
weight_func = getattr(medpy_hist_metrics, weight_method_spec)
if use_orig_distr:
warnings.warn('use_original_distribution must be False '
'when using builtin histogram metrics, '
'which expect histograms as input'
' - setting it to False.', HiwenetWarning)
use_orig_distr = False
elif weight_method_spec in list_hiwenet_histogram_metrics:
weight_func = getattr(more_metrics, weight_method_spec)
elif weight_method_spec in metrics_on_original_features:
weight_func = getattr(more_metrics, weight_method_spec)
if not use_orig_distr:
warnings.warn('use_original_distribution must be True'
' when using builtin non-histogram metrics,'
' which expect original feature values in ROI/node '
'as input - setting it to True.', HiwenetWarning)
use_orig_distr = True
if weight_method_spec in symmetric_metrics_on_original_features:
print('Chosen metric is symmetric. Ignoring asymmetric=False flag.')
allow_non_symmetric=False
else:
raise NotImplementedError('Chosen histogram distance/metric {}'
' not implemented or invalid.'
''.format(weight_method_spec))
elif callable(weight_method_spec):
# ensure 1) takes two ndarrays
try:
dummy_weight = weight_method_spec(make_random_histogram(), make_random_histogram())
except:
raise TypeError('Error applying given callable on two input arrays.\n'
'{} must accept 2 arrays and return a 1 scalar value!')
else:
# and 2) returns only one number
if not np.isscalar(dummy_weight):
raise TypeError('Given callable does not return '
'a single scalar as output.')
weight_func = weight_method_spec
else:
raise ValueError('Supplied method to compute edge weight is not recognized:\n'
'must be a string identifying one of the implemented methods\n{}'
'\n or a valid callable that accepts that two arrays '
'and returns 1 scalar.'.format(list_medpy_histogram_metrics))
return weight_func, use_orig_distr, allow_non_symmetric
def check_params(features_spec, groups_spec, num_bins, edge_range_spec,
trim_outliers, trim_percentile):
"""Necessary check on values, ranges, and types."""
if isinstance(features_spec, str) and isinstance(groups_spec, str):
features, groups = read_features_and_groups(features_spec, groups_spec)
else:
features, groups = features_spec, groups_spec
num_bins, edge_range, features, groups = \
type_cast_params(num_bins, edge_range_spec, features, groups)
num_values = len(features)
# memberships
group_ids, num_groups = identify_groups(groups)
num_links = np.int64(num_groups * (num_groups - 1) / 2.0)
check_param_ranges(num_bins, num_groups, num_values, trim_outliers, trim_percentile)
return features, groups, num_bins, edge_range, group_ids, num_groups, num_links
[docs]def run_cli():
"Command line interface to hiwenet."
features_path, groups_path, weight_method, num_bins, edge_range, \
trim_outliers, trim_percentile, return_networkx_graph, out_weights_path = parse_args()
# TODO add the possibility to process multiple combinations of parameters:
# diff subjects, diff metrics
# for features_path to be a file containing multiple subjects (one/line)
# -w could take multiple values kldiv,histint,
# each line: input_features_path,out_weights_path
features, groups = read_features_and_groups(features_path, groups_path)
extract(features, groups,
weight_method=weight_method, num_bins=num_bins,
edge_range=edge_range, trim_outliers=trim_outliers,
trim_percentile=trim_percentile,
return_networkx_graph=return_networkx_graph,
out_weights_path=out_weights_path)
def read_features_and_groups(features_path, groups_path):
"Reader for data and groups"
try:
if not pexists(features_path):
raise ValueError('non-existent features file')
if not pexists(groups_path):
raise ValueError('non-existent groups file')
if isinstance(features_path, str):
features = np.genfromtxt(features_path, dtype=float)
else:
raise ValueError('features input must be a file path ')
if isinstance(groups_path, str):
groups = np.genfromtxt(groups_path, dtype=str)
else:
raise ValueError('groups input must be a file path ')
except:
raise IOError('error reading the specified features and/or groups.')
if len(features) != len(groups):
raise ValueError("lengths of features and groups do not match!")
return features, groups
def get_parser():
"Specifies the arguments and defaults, and returns the parser."
parser = argparse.ArgumentParser(prog="hiwenet")
parser.add_argument("-f", "--in_features_path", action="store",
dest="in_features_path",
required=True,
help="Abs. path to file containing features for a given subject")
parser.add_argument("-g", "--groups_path", action="store", dest="groups_path",
required=True,
help="path to a file containing element-wise membership "
"into groups/nodes/patches.")
parser.add_argument("-w", "--weight_method", action="store", dest="weight_method",
default=default_weight_method, required=False,
help="Method used to estimate the weight between "
"the pair of nodes. Default : {}".format(
default_weight_method))
parser.add_argument("-o", "--out_weights_path", action="store",
dest="out_weights_path",
default=default_out_weights_path, required=False,
help="Where to save the extracted weight matrix. "
"If networkx output is returned, "
"it would be saved in GraphML format. "
"Default: nothing saved.")
parser.add_argument("-n", "--num_bins", action="store", dest="num_bins",
default=default_num_bins, required=False,
help="Number of bins used to construct the histogram."
" Default : {}".format(default_num_bins))
parser.add_argument("-r", "--edge_range", action="store", dest="edge_range",
default=default_edge_range, required=False,
nargs = 2,
help="The range of edges (two finite values) within which "
"to bin the given values e.g. --edge_range 1 6 This "
"can be helpful to ensure correspondence across "
"multiple invocations of hiwenet (for different "
"subjects), in terms of range across all bins as well "
"as individual bin edges. Default : {}, "
"to automatically compute from the given "
"values.".format(
default_edge_range))
parser.add_argument("-t", "--trim_outliers", action="store", dest="trim_outliers",
default=default_trim_behaviour, required=False,
help="Boolean flag indicating whether to trim "
"the extreme/outlying values. Default True.")
parser.add_argument("-p", "--trim_percentile", action="store", dest="trim_percentile",
default=default_trim_percentile, required=False,
help="Small value specifying the percentile of outliers to trim. "
"Default: {0}%% , must be in open interval (0, 100)."
"".format(default_trim_percentile))
parser.add_argument("-x", "--return_networkx_graph", action="store",
dest="return_networkx_graph",
default=default_return_networkx_graph, required=False,
help="Boolean flag indicating whether to return a networkx "
"graph populated with weights computed. Default: False")
return parser
def parse_args():
"""Parser/validator for the cmd line args."""
parser = get_parser()
if len(sys.argv) < 2:
parser.print_help()
warnings.warn('Too few arguments!', UserWarning)
parser.exit(1)
# parsing
try:
params = parser.parse_args()
except Exception as exc:
print(exc)
raise ValueError('Unable to parse command-line arguments.')
in_features_path = os.path.abspath(params.in_features_path)
if not os.path.exists(in_features_path):
raise IOError("Given features file doesn't exist.")
groups_path = os.path.abspath(params.groups_path)
if not os.path.exists(groups_path):
raise IOError("Given groups file doesn't exist.")
return in_features_path, groups_path, params.weight_method, params.num_bins, \
params.edge_range, params.trim_outliers, params.trim_percentile, \
params.return_networkx_graph, params.out_weights_path
if __name__ == '__main__':
run_cli()