PythonAnalysis/test/testTables.py

0001 #!/usr/bin/env python3
0002
0003 # test for pytables
0004 # taken from https://kastnerkyle.github.io/posts/using-pytables-for-larger-than-ram-data-processing/
0005 # but with some interface modifications (presumably due to pytables changes)
0006
0007 import numpy as np
0008 import matplotlib
0009 matplotlib.use('Agg')
0010 import matplotlib.pyplot as plt
0011 import tables
0012
# Module-level generator with a fixed seed so repeated runs produce the same data.
random_state = np.random.RandomState(1999)


def make_random_cluster_points(n_samples, random_state=random_state):
    """Draw 2-D points scattered around four fixed cluster centers.

    Each sample is assigned at random to one of the centers
    (-1, -1), (1, 1), (1, -1), (-1, 1) and then offset by Gaussian noise
    with standard deviation 0.2 on each axis.

    Args:
        n_samples: Number of points to generate.
        random_state: ``numpy.random.RandomState`` used for both the cluster
            assignment and the noise. Defaults to the module-level generator
            seeded with 1999.

    Returns:
        A tuple ``(points, labels)``: ``points`` is an ``(n_samples, 2)``
        float array and ``labels`` is an ``(n_samples,)`` integer array of
        indices into the list of centers.
    """
    mu_options = np.array([(-1, -1), (1, 1), (1, -1), (-1, 1)])
    sigma = 0.2
    mu_choices = random_state.randint(0, len(mu_options), size=n_samples)
    means = mu_options[mu_choices]
    # Draw the noise from the supplied generator rather than numpy's global
    # one, so the output is fully determined by ``random_state``.
    noise = random_state.randn(n_samples, 2) * sigma
    return means + noise, mu_choices
0021
def plot_clusters(data, clusters, name):
    """Scatter-plot ``data`` on a new figure, one color per cluster label.

    Args:
        data: 2-D array of points; column 0 is plotted on x, column 1 on y.
        clusters: Array of integer labels, one per row of ``data``. Each
            label is used as an index into a four-entry color palette.
        name: Text inserted into the figure title.
    """
    palette = ["#9b59b6", "#3498db", "#e74c3c", "#2ecc71"]
    plt.figure()
    for label in np.unique(clusters):
        # Boolean mask selecting the rows that belong to this cluster.
        in_cluster = clusters == label
        xs = data[in_cluster, 0]
        ys = data[in_cluster, 1]
        plt.scatter(xs, ys, color=palette[label])
    plt.axis('off')
    plt.title('Plot from %s' % name)
0029
# Generate the sample points and their cluster labels.
sample_data, sample_clusters = make_random_cluster_points(10000)
hdf5_path = "my_data.hdf5"
# Store both arrays directly under the file root. The context manager closes
# the file even if one of the array writes raises.
with tables.open_file(hdf5_path, mode='w') as hdf5_file:
    hdf5_file.create_array(hdf5_file.root, 'data', sample_data)
    hdf5_file.create_array(hdf5_file.root, 'clusters', sample_clusters)
0036
hdf5_path = "my_data.hdf5"
# Here we slice [:] all the data back into memory, then operate on it.
# Because the slices are read inside the ``with`` block, the file is closed
# (even on error) before any plotting happens.
with tables.open_file(hdf5_path, mode='r') as read_hdf5_file:
    hdf5_data = read_hdf5_file.root.data[:]
    hdf5_clusters = read_hdf5_file.root.clusters[:]

# NOTE(review): nothing in the code visible here saves or shows the figure;
# with the 'Agg' backend it is only built in memory -- confirm whether a
# savefig call is intended.
plot_clusters(hdf5_data, hdf5_clusters, "PyTables Array")