Coverage for estats/summary.py: 68%

466 statements  

# Copyright (C) 2019 ETH Zurich,
# Institute for Particle Physics and Astrophysics
# Author: Dominik Zuercher

import numpy as np
from tqdm import tqdm
import copy
import matplotlib.pyplot as plt
from estats import utils

from ekit import paths as paths
from ekit import context
from ekit import logger

LOGGER = logger.init_logger(__name__)


class summary:
    """
    The summary module is meant to postprocess summary statistics
    measurements.

    The main functionality of the summary module is to calculate mean
    data-vectors, standard deviations and covariance or precision matrices
    for the summary statistics at different parameter configurations,
    based on a set of realizations of the summary statistic at each
    configuration.

    The meta data (e.g. cosmology setting, precision settings, tomographic
    bin and so on) for each set of realizations (read in from a file or an
    array directly) can be given to the module on read-in directly or
    parsed from the filename.
    Directly after read-in, a first postprocessing can be done using the
    process function defined in the statistic plugin.
    The read-in data-vectors are appended to a data table for each
    statistic and the meta data is added to an internal meta data table.
    The realizations are ordered according to their meta data entry. There
    are two special entries in the meta data (tomo: the label of the
    tomographic bin of the data-vectors; NREALS: the number of data-vectors
    associated with each entry, inferred automatically).
    All other entries can be defined by the user.

    The summary module allows one to downbin the potentially very long
    data-vectors into larger bins using a binning scheme.
    The decide_binning_scheme function in the statistic plugin decides on
    that scheme, which defines the edges of the large bins based on the
    bins of the original data-vectors. For plotting purposes the binning
    scheme can also define the values of each data bin (for example its
    signal-to-noise ratio).
    The slice function in the statistic plugin then defines how exactly
    the binning scheme is used to downbin each data-vector.
    See the :ref:`own_stat` Section for more details.

    The summary module allows one to combine summary statistics calculated
    for different tomographic bins to perform a tomographic analysis. The
    tomo entry in the meta data table defines the label of the tomographic
    bin for each set of data-vector realizations. The order of the labels
    in the combined joint data-vector can be set using the cross_ordering
    keyword.

    The summary module also allows one to combine different summary
    statistics into a joint data-vector.

    The most important functionalities are:

    - generate_binning_scheme:

        Uses the decide_binning_scheme function from the statistic plugin
        to create a binning scheme. The scheme can be created for
        different tomographic bins and scales.
        See the Section :ref:`own_stat` for more details.

    - readin_stat_files:

        Reads in data-vector realizations from files. The process function
        from the statistic plugin is used to perform a first processing of
        the data. The meta data for each file can either be given directly
        or can be parsed from the file name by giving a list of parameters
        indicating the fields to be parsed (using ekit).

    - downbin_data:

        Uses the created binning scheme to bin the data-vector entries
        into larger bins, using the slice function from the statistic
        plugin.

    - join_redshift_bins:

        Joins all data-vector realizations of a specific statistic at the
        same configuration. The tomo entry in the meta data table defines
        the label of the tomographic bin for each set of data-vector
        realizations. The order of the labels in the combined joint
        data-vector can be set using the cross_ordering keyword. If, for a
        specific parameter configuration, different numbers of
        realizations are found for different tomographic bins, only the
        minimum number of realizations is used to calculate the combined
        data-vectors.

    - join_statistics:

        Creates a new statistic entry, including the data table and the
        meta data table, by concatenating the data-vectors of a set of
        statistics. The new statistic has the name
        statistic1-statistic2-...
        If, for a specific parameter configuration, different numbers of
        realizations are found for different statistics, only the minimum
        number of realizations is used to calculate the combined
        data-vectors.

    - get_means:

        Returns the mean data-vectors of a statistic for the different
        parameter configurations.

    - get_meta:

        Returns the full meta data table for a statistic.

    - get_errors:

        Returns the standard deviations of the data-vectors of a statistic
        for the different configurations.

    - get_covariance_matrices:

        Returns the covariance matrices estimated from the realizations at
        each configuration. Can also invert the covariance matrices
        directly to obtain the precision matrices.

    The accepted keywords are:

    - cross_ordering:

        default: []

        choices: a list of labels

        Indicates the order of the tomographic bin labels that is used by
        join_redshift_bins to combine data-vectors from different
        tomographic bins.

        The labels could be bin1xbin2 for example, and the corresponding
        cross_ordering could be [1x1, 1x2, 2x2, 2x3, 3x3].
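
    A minimal usage sketch (the file name and the parameter values are
    purely illustrative)::

        from estats.summary import summary

        s = summary()
        s.readin_stat_files(
            ['Peaks_Om=0.26_s8=0.84_tomo=1.npy'], statistic='Peaks',
            parse_meta=True, parameters=['Om', 's8', 'tomo'])
        s.generate_binning_scheme(statistics=['Peaks'], bin=1)
        s.downbin_data(statistics=['Peaks'])
        means, errors, covs = s.get_full_summary(statistic='Peaks')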

    """

    def __init__(self, context_in={}, verbosity=3, **kwargs):
        """
        Initialization function.

        :param context_in: Context instance
        :param verbosity: Verbosity level (0-4)
        """

        logger.set_logger_level(LOGGER, verbosity)

        LOGGER.debug("Initialized summary object with no data so far.")

        allowed = ['cross_ordering']
        defaults = [[]]
        types = ['list']

        allowed, types, defaults = utils._get_plugin_contexts(
            allowed, types, defaults)
        self.ctx = context.setup_context(
            {**context_in, **kwargs}, allowed, types, defaults,
            verbosity=verbosity)

        self.meta = {}
        self.data = {}
        self.bin_centers = {}
        self.bin_edges = {}

    # ACCESSING OBJECTS
    def get_data(self, statistic='Peaks', params=None):
        """
        Get the full data array for a certain statistic.

        :param statistic: Statistic for which to return the data array
        :param params: Can provide a dictionary of parameter values.
                       Returns only realisations with those parameters.
        :return: All data vector realisations for the statistic
        """

        if statistic == 'all':
            return self.data
        else:
            if params is not None:
                check = np.ones(self.meta[statistic].shape[0], dtype=bool)
                for key, value in params.items():
                    if key not in self.parameters:
                        continue
                    item_row = np.where(
                        np.asarray(self.parameters) == key)[0]
                    check &= np.isclose(
                        self.meta[statistic][:, item_row].astype(
                            type(value)).reshape(-1), value)
                ids = np.where(check)[0]
                out = np.zeros((0, self.data[statistic].shape[1]))
                for idx in ids:
                    if idx == 0:
                        start = 0
                    else:
                        start = int(np.sum(
                            self.meta[statistic][:idx, 1].astype(float)))
                    end = start + int(
                        np.asarray(
                            self.meta[statistic][idx, 1]).astype(float))
                    out = np.vstack((out, self.data[statistic][start:end]))
            else:
                out = self.data[statistic]
            return out
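
    # Usage sketch (illustrative, not part of the module): given a filled
    # summary instance s, select only the realisations measured at one
    # parameter configuration.
    #
    #     fid_reals = s.get_data(statistic='Peaks',
    #                            params={'Om': 0.26, 's8': 0.84})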

    def get_meta(self, statistic='Peaks'):
        """
        Get the full meta table for a certain statistic.

        :param statistic: Statistic for which to return the meta data
        :return: Full meta table for the statistic
        """
        dtype = []
        if statistic not in self.meta.keys():
            raise Exception(
                "Meta data table not set for statistic {}".format(statistic))
        if len(self.meta[statistic]) == 0:
            LOGGER.error(
                f"Meta data for statistic {statistic} is empty.")
            return []
        for ii in self.meta[statistic][0, :]:
            try:
                float(ii)
                dtype.append('f8')
                continue
            except ValueError:
                pass
            try:
                int(ii)
                dtype.append('i4')
                continue
            except ValueError:
                pass
            dtype.append('<U50')
        fields = []
        for line in self.meta[statistic]:
            fields.append(tuple(line))
        dtype = np.dtype({'names': tuple(self.parameters),
                          'formats': tuple(dtype)})
        to_return = np.core.records.fromrecords(fields, dtype=dtype)
        return to_return
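
    # Usage sketch (illustrative): the returned record array is indexed by
    # the parameter labels, e.g. assuming 'Om' was parsed from file names.
    #
    #     meta = s.get_meta(statistic='Peaks')
    #     print(meta['tomo'], meta['NREALS'], meta['Om'])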

    def get_binning_scheme(self, statistic='Peaks', bin=0):
        """
        Get the binning scheme for a certain statistic and tomographic bin.

        :param statistic: Statistic for which to return the binning scheme
        :param bin: Tomographic bin for which to return the binning scheme
        :return: bin edges and bin centers
        """
        return (self.bin_edges[bin][statistic][:],
                self.bin_centers[bin][statistic][:])

    def get_means(self, statistic='Peaks'):
        """
        Returns the mean data vectors for all different parameter
        configurations for a given statistic.

        :param statistic: Statistic for which to return the means
        :return: An array containing the mean data vectors as
                 (number of parameter configurations, data vector length)
        """
        pointer = 0
        means = np.zeros(
            (self.meta[statistic].shape[0], self.data[statistic].shape[1]))
        for ii, entry in enumerate(self.meta[statistic]):
            means[ii, :] = np.nanmean(
                self.data[statistic][pointer:pointer + int(float(entry[1])),
                                     :],
                axis=0)
            pointer += int(float(entry[1]))
        return means

    def get_errors(self, statistic='Peaks'):
        """
        Returns the error data vectors for all different parameter
        configurations for a given statistic.

        :param statistic: Statistic for which to return the errors
        :return: An array containing the error data vectors as
                 (number of parameter configurations, data vector length)
        """
        pointer = 0
        errors = np.zeros(
            (self.meta[statistic].shape[0], self.data[statistic].shape[1]))
        for ii, entry in enumerate(self.meta[statistic]):
            errors[ii, :] = np.nanstd(
                self.data[statistic][pointer:pointer + int(float(entry[1])),
                                     :],
                axis=0)
            pointer += int(float(entry[1]))
        return errors
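
    # Layout sketch: self.data[statistic] stacks the realisation blocks of
    # all configurations; the NREALS meta column gives each block length,
    # so configuration ii occupies rows
    # [sum(NREALS[:ii]), sum(NREALS[:ii + 1])). For example:
    #
    #     meta = s.get_meta('Peaks')
    #     start = int(np.sum(meta['NREALS'][:2]))
    #     end = start + int(meta['NREALS'][2])
    #     block = s.get_data('Peaks')[start:end]
    #     assert np.allclose(np.nanmean(block, axis=0),
    #                        s.get_means('Peaks')[2])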

    def get_covariance_matrices(self, statistic='Peaks', invert=False,
                                set_to_id_if_failed=False,
                                plot_fiducial_correlation=False,
                                plot_fiducial_incov=False,
                                Fiducials=[], store_fiducial_cov=False):
        """
        Returns the covariance matrices for all different parameter
        configurations for a given statistic.

        :param statistic: Statistic for which to return the covariance
                          matrices
        :param invert: If True attempts to invert the covariance matrices.
                       Raises an error if the inversion is numerically
                       unstable.
        :param set_to_id_if_failed: If True sets matrices to the identity
                                    matrix if the inversion fails.
        :param plot_fiducial_correlation: If True plots the correlation and
                                          covariance matrices for the
                                          fiducial parameter setting
                                          indicated by Fiducials.
        :param plot_fiducial_incov: If True plots the inverted covariance
                                    matrix for the fiducial parameter
                                    setting indicated by Fiducials.
        :param Fiducials: The fiducial parameter values for which to plot
        :param store_fiducial_cov: If True stores the fiducial covariance
                                   matrix separately (less RAM in MCMC)
        :return: An array containing the (inverted) covariance matrices as
                 (number of parameter configurations, data vector length,
                 data vector length)
        """
        covs = self._calc_cov_matrices(
            statistic, invert, set_to_id_if_failed,
            plot_fiducial_correlation,
            plot_fiducial_incov,
            Fiducials, store_fiducial_cov=store_fiducial_cov)
        return covs
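
    # Usage sketch: precision matrices for a Gaussian likelihood.
    #
    #     incovs = s.get_covariance_matrices(statistic='Peaks', invert=True)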

    def get_full_summary(self, statistic='Peaks', invert=False,
                         set_to_id_if_failed=False,
                         plot_fiducial_correlation=False,
                         plot_fiducial_incov=False,
                         Fiducials=[],
                         label='',
                         check_stability=False, calc_covs=True,
                         calc_fiducial_cov=False, store_fiducial_cov=False):
        """
        Returns the mean data vectors, errors and (inverted) covariance
        matrices for all different parameter configurations for a given
        statistic.

        :param statistic: Statistic for which to return the summary
        :param invert: If True attempts to invert the covariance matrices.
                       Raises an error if the inversion is numerically
                       unstable.
        :param set_to_id_if_failed: If True sets matrices to the identity
                                    matrix if the inversion fails.
        :param plot_fiducial_correlation: If True plots the correlation and
                                          covariance matrices for the
                                          fiducial parameter setting
                                          indicated by Fiducials.
        :param plot_fiducial_incov: If True plots the inverted covariance
                                    matrix for the fiducial parameter
                                    setting indicated by Fiducials.
        :param Fiducials: The fiducial parameter values for which to plot
        :param label: Label used to create the paths for the covariance and
                      correlation matrix plots
        :param check_stability: If True performs numerical stability checks
                                when inverting covariance matrices
        :param calc_covs: Can turn off the calculation of covariance
                          matrices, for example when SVD should be used.
        :param calc_fiducial_cov: If True and calc_covs is False only the
                                  fiducial matrix is computed
        :param store_fiducial_cov: If True stores the fiducial covariance
                                   matrix separately (less RAM in MCMC)
        :return: 3 objects: means, errors and (inverted) covariance
                 matrices
        """

        errors = self.get_errors(statistic)
        means = self.get_means(statistic)

        if calc_covs or calc_fiducial_cov:
            covs = self._calc_cov_matrices(
                statistic, invert, set_to_id_if_failed,
                plot_fiducial_correlation,
                plot_fiducial_incov,
                Fiducials, label, check_stability,
                calc_fiducial_cov=calc_fiducial_cov,
                store_fiducial_cov=store_fiducial_cov)
        else:
            covs = []
        return (means, errors, covs)
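
    # Usage sketch: means, errors and precision matrices in one call.
    #
    #     means, errors, incovs = s.get_full_summary(
    #         statistic='Peaks', invert=True, check_stability=True)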

    # SETTING OBJECTS
    def set_data(self, data, statistic='Peaks'):
        """
        Set the full data array.

        :param data: Data array to set
        :param statistic: The statistic for which to set the data
        """
        self.data[statistic] = data[:]

    def set_meta(self, meta, statistic='Peaks', parameters=[]):
        """
        Set the meta array.

        :param meta: Meta array to set
        :param statistic: The statistic for which to set the meta data
        :param parameters: A list indicating the parameter labels in the
                           meta data table
        """
        self.meta[statistic] = meta[:]
        self.parameters = parameters[:]

    def set_binning_scheme(self, bin_centers, bin_edges,
                           statistic='Peaks', bin=0):
        """
        Set the binning scheme.

        :param bin_centers: Centers of the bins to set (only used for
                            plotting)
        :param bin_edges: Edges of the bins to set
        :param statistic: The statistic for which to set the binning scheme
        :param bin: The tomographic bin for which to set the binning scheme
        """
        try:
            self.bin_centers[bin][statistic] = bin_centers[:]
            self.bin_edges[bin][statistic] = bin_edges[:]
        except KeyError:
            self.bin_centers[bin] = {}
            self.bin_edges[bin] = {}
            self.bin_centers[bin][statistic] = bin_centers[:]
            self.bin_edges[bin][statistic] = bin_edges[:]

    def readin_stat_files(self, files, statistic='Peaks', meta_list=[],
                          parse_meta=False, parameters=[]):
        """
        Reads in data from files and performs a first processing.

        :param files: A list of file paths containing the realisations for
                      different parameter configurations
        :param statistic: The name of the statistic to which the files
                          belong
        :param meta_list: Can provide a list for the meta data of the shape
                          (num_of_files, num_of_parameters)
        :param parse_meta: If set to True will try to get the meta
                           parameters from the file names (names need to
                           be in ekit format)
        :param parameters: List of strings indicating the parameter names
                           that should be extracted from the file names
        """
        if type(files) is str:
            files = [files]
        if len(files) == 0:
            return

        plugin = utils.import_executable(
            statistic, 'process')

        for ii, file in tqdm(enumerate(files)):
            data = np.load(file)
            self.readin_stat_data(data, plugin, statistic=statistic,
                                  meta_list=meta_list,
                                  parse_meta_from_file_names=parse_meta,
                                  parameters=parameters, file=file,
                                  sort_data=False)
        LOGGER.info("Completed readin of {} files for statistic {}".format(
            len(files), statistic))
        self._sort_data(statistic, copy.copy(parameters))
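
    # Usage sketch (file name illustrative): with parse_meta=True the meta
    # data is parsed from ekit-formatted file names.
    #
    #     s.readin_stat_files(
    #         ['Peaks_Om=0.26_s8=0.84_tomo=1.npy'], statistic='Peaks',
    #         parse_meta=True, parameters=['Om', 's8', 'tomo'])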

    def rescale_data(self, statistic='Peaks'):
        """
        Divides each data bin by the median of the samples.
        Useful when performing SVD for example.

        :param statistic: The name of the statistic for which to rescale
                          the data
        """
        self.data[statistic] /= np.median(self.data[statistic], axis=0)

    def readin_stat_data(self, data, plugin=None, statistic='Peaks',
                         meta_list=[],
                         parse_meta_from_file_names=False, parameters=[],
                         file='', sort_data=True):
        """
        Processing of a chunk of realisations belonging to the same
        parameter configuration.

        :param data: An array containing the realisations as
                     (n_reals, length of data vectors)
        :param statistic: The name of the statistic to which the
                          realisations belong
        :param meta_list: Can provide a list for the meta data of the shape
                          (1, num_of_parameters)
        :param parse_meta_from_file_names: If set to True will try to get
                                           the meta parameters from the
                                           file name passed by file (name
                                           needs to be in ekit format)
        :param parameters: List of strings indicating the parameter names
                           that should be extracted from the file name
        :param file: A path from which the meta data can be parsed
        :param sort_data: If True performs a lexsort of the full data array
        """

        if plugin is None:
            plugin = utils.import_executable(
                statistic, 'process')

        meta = self._get_meta(
            data, meta_list, parse_meta_from_file_names,
            parameters, file_name=file)
        data = self._process_data(
            plugin, data, statistic, meta[0, 0])

        # update NREALS
        NREALS = int(data.shape[0])
        meta[0, 1] = NREALS

        # stacking
        try:
            self.meta[statistic] = np.vstack((self.meta[statistic], meta))
        except KeyError:
            self.meta[statistic] = meta

        try:
            self.data[statistic] = np.vstack((self.data[statistic], data[:]))
        except KeyError:
            self.data[statistic] = data

        if sort_data:
            self._sort_data(statistic, copy.copy(parameters))

    # METHODS

    def generate_binning_scheme(self, statistics=['Peaks', 'Voids', 'CLs',
                                                  'Minkowski', '2PCF',
                                                  '2VCF'],
                                bin=0):
        """
        Generates a binning scheme for different statistics for a given
        tomographic bin.

        :param statistics: A list of statistics for which to compute the
                           binning scheme
        :param bin: The tomographic bin for which to calculate the binning
                    scheme
        """
        if isinstance(statistics, str):
            statistics = [statistics]

        for stat in statistics:
            if ('-' in stat):
                LOGGER.warning(
                    'Cannot make binning scheme '
                    'for joined statistics. Skipping...')
                continue

            bin_centers, bin_edges = self._decide_on_bins(
                stat, bin)

            try:
                self.bin_centers[bin][stat] = bin_centers
                self.bin_edges[bin][stat] = bin_edges
            except KeyError:
                self.bin_centers[bin] = {}
                self.bin_edges[bin] = {}
                self.bin_centers[bin][stat] = bin_centers
                self.bin_edges[bin][stat] = bin_edges

    def downbin_data(self, statistics=['Peaks', 'Voids', 'CLs',
                                       'Minkowski']):
        """
        Use the binning scheme to downbin the data vectors into larger
        bins.

        :param statistics: The statistics for which to perform the binning
        """
        if isinstance(statistics, str):
            statistics = [statistics]
        for stat in statistics:
            self._slice_data(errors=[], stat=stat)
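
    # Usage sketch: a binning scheme must exist before downbinning.
    #
    #     s.generate_binning_scheme(statistics=['Peaks'], bin=1)
    #     s.downbin_data(statistics=['Peaks'])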

    def join_statistics(self, statistics=['Peaks', 'CLs']):
        """
        Joins the data vectors of different statistics. Creates new meta
        and data entries with the key given by the single statistics
        concatenated by '-'s.

        :param statistics: The statistics which should be combined.
        """

        # check that there is meta and data for all stats
        for s in statistics:
            if s not in self.meta:
                raise Exception(f"Did not find meta data for statistic {s}")
            if s not in self.data:
                raise Exception(f"Did not find data for statistic {s}")

        # get entries present in all statistics
        total_entries = np.zeros(
            (0, self.meta[statistics[0]].shape[1]), dtype=float)
        for s in statistics:
            total_entries = np.vstack((total_entries, self.meta[s]))
        # remove tomo and NREALS
        total_tomos = total_entries[:, 0]
        total_entries = total_entries[:, 2:]
        unique_entries = np.unique(total_entries.astype(float), axis=0)
        unique_tomos = np.unique(total_tomos.astype(float), axis=0)
        if len(unique_tomos) > 1:
            raise Exception("Not all entries have the same tomographic bin")
        if len(unique_entries) == 0:
            raise Exception("Found no entries to combine")

        data_vector_length = 0
        for s in statistics:
            data_vector_length += self.data[s].shape[1]
        new_meta = np.zeros(
            (0, self.meta[statistics[0]].shape[1]), dtype=object)
        new_data = np.zeros((0, data_vector_length))

        # combine each entry
        for entry in unique_entries:
            skip = False
            idxs = []
            for s in statistics:
                idx = np.array([True] * self.meta[s].shape[0])
                for ii in range(unique_entries.shape[1]):
                    to_check = self.meta[s][:, ii + 2].astype(float)
                    idx = np.logical_and(
                        idx, np.isclose(to_check, entry[ii]))
                # check that idx has only one entry
                if int(np.sum(idx)) != 1:
                    LOGGER.warning(
                        f"For some reason found {int(np.sum(idx))} "
                        f"corresponding entries for statistic {s}. "
                        f"Skipping entry {entry}")
                    skip = True
                    break
                idxs.append(np.where(idx)[0][0])
            if skip:
                continue
            block_lengths = [self.meta[statistics[xx]][idxs[xx], 1]
                             for xx in range(len(statistics))]
            block_length = int(np.min(np.asarray(block_lengths, dtype=float)))
            new_meta_ = copy.copy(self.meta[statistics[0]][idxs[0]])
            new_meta_[1] = block_length
            new_meta = np.vstack((new_meta, new_meta_))

            new_data_ = np.zeros((block_length, 0))
            for ids, s in enumerate(statistics):
                lower = int(np.sum(self.meta[s][:idxs[ids], 1].astype(float)))
                new_data_temp = self.data[s][lower:lower + block_length, :]
                new_data_ = np.hstack((new_data_, new_data_temp))
            new_data = np.vstack((new_data, new_data_))

        # set the new meta and data arrays
        self.meta['-'.join(statistics)] = new_meta
        self.data['-'.join(statistics)] = new_data

        LOGGER.info("Constructed joined statistic {}".format(
            '-'.join(statistics)))
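
    # Usage sketch: the joint statistic is afterwards accessed under its
    # concatenated name.
    #
    #     s.join_statistics(statistics=['Peaks', 'CLs'])
    #     joint_means = s.get_means(statistic='Peaks-CLs')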

    def join_redshift_bins(self):
        """
        Concatenates the data vector realisations for the different
        tomographic bins. The old single bin entries get deleted and the
        new entries have the tomographic bin set to -1.
        """
        for statistic in self.data.keys():
            # get all tomographic bins in data
            bins = np.unique(self.get_meta(statistic)['tomo'])
            # if only one bin do nothing but set tomo to -1 to indicate
            if len(bins) == 1:
                LOGGER.warning(
                    "Cannot join tomographic bins since all samples have "
                    "the same tomographic bin. Skipping")
                self.meta[statistic][:, 0] = -1
                continue
            if len(self.ctx['cross_ordering']) > 0:
                bins_ = self.ctx['cross_ordering']
                new_bins = []
                for bin in bins_:
                    if bin in bins:
                        new_bins.append(bin)
                bins = new_bins
            else:
                LOGGER.warning(
                    "Did not find cross_ordering entry. "
                    "Trying to guess the order of the tomographic bins.")

            # get all configurations in data
            meta = self.get_meta(statistic)
            new_meta = np.zeros((meta.shape[0], 0))
            fields = meta.dtype.fields.keys()
            parameters = []
            for field in fields:
                if (field != 'tomo') & (field != 'NREALS'):
                    new_meta = np.hstack((new_meta,
                                          meta[field].reshape(-1, 1)))
                    parameters.append(field)
            unique_meta = np.unique(new_meta, axis=0)

            # loop over the unique entries
            temp_data = np.zeros((0,
                                  self.data[statistic].shape[1] * len(bins)))
            temp_meta = np.zeros((0, self.meta[statistic].shape[1]),
                                 dtype=self.meta[statistic].dtype)
            for entry in tqdm(unique_meta):
                check = np.ones(self.meta[statistic].shape[0], dtype=bool)
                for ii, field in enumerate(parameters):
                    for jj in range(check.size):
                        check[jj] &= (str(meta[field][jj]) == str(entry[ii]))

                if np.sum(check) != len(bins):
                    LOGGER.warning(
                        "For entry with parameters {} there are some "
                        "bins missing. Skipping".format(entry))
                    continue

                # get starting and stopping points
                starts = []
                stops = []
                for e in np.where(check)[0]:
                    starts.append(np.sum(
                        meta['NREALS'][:e]))
                    stops.append(
                        np.sum(meta['NREALS'][:e + 1]))
                starts = np.array(starts, dtype=int)
                stops = np.array(stops, dtype=int)

                # decide on the maximum number of realisations
                diffs = stops - starts
                max_reals = np.min(diffs)
                diffs = diffs - max_reals
                stops = stops - diffs

                # append combined data
                meta_ = meta[check]
                temp_ = np.zeros((max_reals, 0))
                for bin in bins:
                    idx = np.where(meta_['tomo'] == bin)[0][0]
                    temp_ = np.hstack((
                        temp_, self.data[statistic][starts[idx]:stops[idx]]))
                temp_data = np.vstack((temp_data, temp_))

                # append combined meta
                meta_ = meta_[0]
                meta_['NREALS'] = max_reals
                meta_['tomo'] = -1
                meta_ = np.asarray(list(meta_))
                temp_meta = np.vstack((temp_meta, meta_.reshape(1, -1)))

            # update class objects
            self.data[statistic] = temp_data
            self.meta[statistic] = temp_meta

            LOGGER.info(
                "Joined data from tomographic bins for statistic {}. "
                "The order is {}".format(statistic, bins))
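
    # Usage sketch (labels illustrative): fix the bin ordering via the
    # cross_ordering keyword before joining.
    #
    #     s = summary(cross_ordering=[1, 2, 3])
    #     # ... read in per-bin measurements ...
    #     s.join_redshift_bins()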

    ##################################
    # HELPER FUNCTIONS
    ##################################

    def _calc_cov_matrices(self, statistic, invert=False,
                           set_to_id_if_failed=False,
                           plot_fiducial_correlation=False,
                           plot_fiducial_incov=False,
                           Fiducials=[],
                           label='',
                           check_stability=False, calc_fiducial_cov=False,
                           store_fiducial_cov=False):
        """
        Calculation and inversion of covariance matrices.
        """
        pointer = 0
        covs = np.zeros(
            (self.meta[statistic].shape[0], self.data[statistic].shape[1],
             self.data[statistic].shape[1]))
        for ii, entry in enumerate(self.meta[statistic]):
            block_length = int(float(entry[1]))

            # check if this entry is the fiducial configuration
            check = False
            if len(Fiducials) > 0:
                check = True
                for xx, par in enumerate(entry[2:]):
                    check &= bool(np.isclose(float(par), Fiducials[xx]))
            if calc_fiducial_cov and not check:
                pointer += block_length
                continue

            mx = self.data[statistic][pointer:pointer + block_length]
            c = np.cov(mx, rowvar=False)

            if store_fiducial_cov and check:
                np.save(
                    'fid_cov_mat_{}_{}.npy'.format(statistic, label), c)
            if plot_fiducial_correlation and check:
                plt.figure(figsize=(12, 8))
                plt.imshow(np.ma.corrcoef(mx, rowvar=False,
                                          allow_masked=True))
                plt.xticks(fontsize=20)
                plt.yticks(fontsize=20)
                cbar = plt.colorbar()
                cbar.ax.tick_params(labelsize=20)
                plt.savefig('corr_mat_{}_{}.pdf'.format(statistic, label))
                plt.clf()

                plt.figure(figsize=(12, 8))
                plt.imshow(c)
                plt.xticks(fontsize=20)
                plt.yticks(fontsize=20)
                cbar = plt.colorbar()
                cbar.ax.tick_params(labelsize=20)
                plt.savefig('cov_mat_{}_{}.pdf'.format(statistic, label))
                plt.clf()

            if invert:
                try:
                    incov = np.linalg.solve(c, np.eye(c.shape[0]))
                except np.linalg.LinAlgError:
                    # fall back to tricks if the inversion is not possible
                    if np.isclose(np.linalg.det(c), 0.0):
                        LOGGER.warning(
                            "Determinant close to 0. "
                            "Trying inversion tricks")
                        c *= 10e20
                        if np.isclose(np.linalg.det(c), 0.0):
                            if set_to_id_if_failed:
                                LOGGER.error(
                                    "Inversion of covariance failed. "
                                    "SETTING PRECISION MATRIX TO IDENTITY")
                                incov = np.identity(c.shape[0])
                            else:
                                raise Exception(
                                    "Error: Fiducial covariance matrix "
                                    "not invertible!!!")
                        else:
                            incov = np.linalg.pinv(c)
                            incov *= 10e20
                    else:
                        incov = np.linalg.pinv(c)

                # check numerical stability of the inversion
                if check_stability:
                    id_check = np.dot(c, incov)
                    id_check -= np.eye(id_check.shape[0])
                    if not np.all(np.isclose(id_check, 0.0, atol=1e-10,
                                             rtol=0.0)):
                        raise Exception(
                            "Inversion of fiducial covariance matrix "
                            "did not pass the numerical stability test")
                    else:
                        LOGGER.info("Successfully inverted fiducial "
                                    "covariance matrix")

                covs[ii] = incov

                if plot_fiducial_incov and check:
                    plt.figure(figsize=(12, 8))
                    plt.imshow(incov)
                    plt.xticks(fontsize=20)
                    plt.yticks(fontsize=20)
                    cbar = plt.colorbar()
                    cbar.ax.tick_params(labelsize=20)
                    plt.savefig(
                        'incov_mat_{}_{}.pdf'.format(statistic, label))
                    plt.clf()
            else:
                covs[ii] = c

            if calc_fiducial_cov and check:
                # only the fiducial matrix was requested
                covs = [covs[ii]]
                break

            pointer += block_length
        return covs

    def _get_meta_from_file_name(self, parameters, file):
        """
        Parse meta data from file names.
        """
        defined = paths.get_parameters_from_path(file)[0]

        meta = np.zeros(0, dtype=object)
        for param in parameters:
            meta = np.append(meta, defined[param][0])
        return meta

    def _process_data(self, plugin, raw_data, stat, tomo_bin,
                      trimmed_mask_dict={}):
        """
        Apply the process function of the statistic plugin to the raw data.
        """

        self.ctx['tomo_bin'] = tomo_bin
        return plugin(raw_data, self.ctx, False)

    def _stack_data(self, raw_data, num_of_scales):
        """
        Stacks data arrays.
        """
        data = np.zeros(
            (int(raw_data.shape[0] / num_of_scales), raw_data.shape[1]
             * num_of_scales))
        for jj in range(int(raw_data.shape[0] / num_of_scales)):
            data[jj, :] = raw_data[jj * num_of_scales:
                                   (jj + 1) * num_of_scales, :].ravel()
        return data

    def _get_meta(self, data, meta_list=[], parse_meta_from_file_names=False,
                  parameters=[],
                  file_name=''):
        """
        Helper function to assemble the meta data.
        """
        # setting meta data
        NREALS = int(data.shape[0])
        if (len(meta_list) == 0) & (not parse_meta_from_file_names):
            meta = NREALS
        elif not parse_meta_from_file_names:
            meta = np.array([NREALS] + meta_list, dtype=object)
        else:
            to_app = self._get_meta_from_file_name(parameters, file_name)
            meta = np.append(NREALS, to_app)

        # put the tomographic bin at the beginning
        if 'tomo' in parameters:
            idx = np.asarray(parameters) == 'tomo'
            idx = np.append([False], idx)
            idx = np.where(idx)[0]
            tomo_column = meta[idx]
            meta = np.delete(meta, idx)
            meta = np.append(tomo_column, meta)
        else:
            meta = np.append(np.zeros(1), meta)
        return meta.reshape(1, -1)

    def _get_sort_idx(self, parameters, statistic):
        """
        Computes the lexsort order of the meta data.
        """
        # lexsort over all meta columns except NREALS
        idx = np.delete(np.arange(self.meta[statistic].shape[1]), 1)
        sort_idx = np.lexsort(
            np.flip(self.meta[statistic][:, idx], axis=1).transpose())

        return sort_idx

    def _slice_data(self, errors=[], stat='Peaks'):
        """
        Given some data vectors and a list of edge indices for the new bins
        as created by _decide_on_bins(), down-samples the data vectors.
        """

        plugin = utils.import_executable(stat, 'slice')

        slicer_config = plugin(self.ctx)
        num_of_scales, n_bins_sliced, operation, mult = slicer_config[:4]
        if len(slicer_config) > 4:
            range_mode_bool = slicer_config[4]
        else:
            range_mode_bool = False

        n_bins_original = self.data[stat].shape[1] // (mult * num_of_scales)
        new_data_total = np.zeros(
            (self.data[stat].shape[0], mult * n_bins_sliced * num_of_scales))

        bins = np.unique(self.meta[stat][:, 0])

        for bin in bins:
            # get the meta entries for the right bin
            bin_idx = np.where(self.meta[stat][:, 0] == bin)[0]

            # select the data in the right bin
            idx = np.zeros(self.data[stat].shape[0])
            for ii in bin_idx:
                start = int(np.sum(self.meta[stat][:, 1][:ii].astype(int)))
                end = start + int(self.meta[stat][:, 1][ii])
                idx[start:end] = 1
            idx = idx.astype(bool)

            if operation != 'none':
                new_data = _slicer(
                    data=self.data[stat][idx],
                    num_of_samples=self.data[stat][idx].shape[0],
                    n_bins_original=n_bins_original,
                    n_bins_sliced=n_bins_sliced,
                    bin_edges=self.bin_edges[bin][stat],
                    num_of_scales=num_of_scales,
                    mult=mult, operation=operation,
                    range_mode_bool=range_mode_bool)
            else:
                new_data = self.data[stat][idx]

            new_data_total[idx, :] = new_data

        self.data[stat] = new_data_total

        LOGGER.info(
            "Down-sampled data vectors for statistic {} from {} "
            "bins to {} bins".format(
                stat, n_bins_original, n_bins_sliced))

    def _decide_on_bins(self, stat, bin=0):
        """
        Given some realisations of a data vector, decides how to arrange
        a fixed number of bins.
        """

        plugin = utils.import_executable(
            stat, 'decide_binning_scheme')
        try:
            data = self.data[stat]
            meta = self.meta[stat]
        except KeyError:
            data = []
            meta = []
        bin_edge_indices, bin_centers = \
            plugin(data, meta, bin, self.ctx)

        LOGGER.debug("Decided on binning scheme for statistic {}".format(
            stat))

        return bin_centers, bin_edge_indices

    def _sort_data(self, statistic, parameters):
        """
        Sort the data array according to the parameters.
        """
        idx_sort = self._get_sort_idx(parameters, statistic)

        # sort the blocks in the data
        temp_data = np.zeros_like(self.data[statistic])
        pointer = 0
        for ii in range(self.meta[statistic].shape[0]):
            block_length = int(self.meta[statistic][:, 1][idx_sort[ii]])
            start = int(np.sum(
                self.meta[statistic][:, 1][:idx_sort[ii]].astype(int)))
            end = start + int(block_length)
            to_paste = self.data[statistic][start:end, :]
            temp_data[pointer:pointer + int(block_length), :] = to_paste
            pointer += int(block_length)

        # set the objects
        self.data[statistic] = temp_data
        self.meta[statistic] = self.meta[statistic][idx_sort, :]
        self.parameters = ['tomo', 'NREALS']
        parameters.remove('tomo')
        self.parameters += parameters


def _slicer(data, num_of_samples, n_bins_original, n_bins_sliced, bin_edges,
            num_of_scales=1, mult=1, operation='mean', get_std=False,
            range_mode_bool=False):
    """
    Slices data arrays into subbins.
    """
    new_data = np.zeros((num_of_samples, mult * n_bins_sliced * num_of_scales))

    for jj in range(num_of_scales):
        # loop over Minkowski functions
        for ii in range(mult):
            if range_mode_bool:
                data_act = data[:, int(n_bins_original * (mult * jj + ii)):int(
                    n_bins_original * (mult * jj + ii + 1))]
                minima = data_act[:, 0]
                maxima = data_act[:, -1]
                original_bins = np.linspace(
                    minima.reshape(-1), maxima.reshape(-1),
                    num=n_bins_original - 1).T
                original_values = original_bins[:, :-1] + 0.5 * \
                    (original_bins[:, 1:] - original_bins[:, :-1])
                original_weights = data_act[:, 1:-1]

                new_bin_edges = bin_edges[jj]
                digi = np.digitize(original_values, bins=new_bin_edges) - 1
                # unfortunately this is slow, but cannot think of a better
                # way at the moment
                for xx in range(n_bins_sliced):
                    for index in range(original_weights.shape[0]):
                        if operation == 'mean':
                            new_data[index,
                                     n_bins_sliced * (ii + mult * jj) + xx] = \
                                np.nanmean(
                                    original_weights[index, digi[index] == xx])
                        elif operation == 'sum':
                            new_data[index,
                                     n_bins_sliced * (ii + mult * jj) + xx] = \
                                np.nansum(
                                    original_weights[index, digi[index] == xx])
            else:
                # loop over bins
                for xx in range(n_bins_sliced):
                    # slice out the correct scale
                    to_combine = data[:, int(n_bins_original * (mult * jj + ii)
                                             + bin_edges[jj][xx]):int(
                        n_bins_original * (mult * jj + ii)
                        + bin_edges[jj][xx + 1])]

                    if get_std:
                        num = bin_edges[0, xx + 1] - bin_edges[0, xx]
                        fac = 1. / np.sqrt(num)
                    else:
                        fac = 1.

                    if operation == 'mean':
                        new_data[:, n_bins_sliced
                                 * (ii + mult * jj) + xx] = fac \
                            * np.nanmean(to_combine, axis=1)
                    elif operation == 'sum':
                        new_data[:, n_bins_sliced
                                 * (ii + mult * jj) + xx] = fac \
                            * np.nansum(to_combine, axis=1)
    return new_data
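

# Worked example for _slicer (illustrative values): averaging a 6-bin data
# vector down to 2 bins with edge indices [0, 3, 6].
#
#     data = np.arange(12, dtype=float).reshape(2, 6)
#     sliced = _slicer(data, num_of_samples=2, n_bins_original=6,
#                      n_bins_sliced=2, bin_edges=[[0, 3, 6]],
#                      operation='mean')
#     # sliced -> [[1., 4.], [7., 10.]]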