Jupyter Snippet NP ch18-code-listing

Jupyter Snippet NP ch18-code-listing

Chapter 18: Code listing

Robert Johansson

Source code listings for Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib (ISBN 978-1-484242-45-2).

Imports

from __future__ import print_function
import numpy as np
np.random.seed(0)
import pandas as pd
import csv
import json
import h5py
import tables
import pickle
# import cPickle
import msgpack

CSV

%%writefile playerstats-2013-2014.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
Overwriting playerstats-2013-2014.csv
%%writefile playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
6,Phil Kessel,TOR,R,82,37,43,80,-5,27,8,20,0,0,6,0,305,12.1,20:39,24.5,14.3
7,Taylor Hall,EDM,L,75,27,53,80,-15,44,7,17,0,1,1,1,250,10.8,20:00,25.4,45.7
8,Alex Ovechkin,WSH,L,78,51,28,79,-35,48,24,39,0,1,10,3,386,13.2,20:32,21.8,66.7
9,Joe Pavelski,SJS,C,82,41,38,79,+23,32,16,31,1,2,3,0,225,18.2,19:51,27.1,56.0
10,Jamie Benn,DAL,L,81,34,45,79,+21,64,5,19,1,3,3,1,279,12.2,19:09,25.0,52.8
11,Nicklas Backstrom,WSH,C,82,18,61,79,-20,54,6,44,1,1,1,0,196,9.2,19:48,23.3,50.4
12,Patrick Sharp,CHI,L,82,34,44,78,+13,40,10,25,0,0,3,1,313,10.9,18:53,22.7,54.6
13,Joe Thornton,SJS,C,82,11,65,76,+20,32,2,19,0,1,3,1,122,9.0,18:55,26.3,56.1
14,Erik Karlsson,OTT,D,82,20,54,74,-15,36,5,31,0,0,1,0,257,7.8,27:04,28.6,0.0
15,Evgeni Malkin,PIT,C,60,23,49,72,+10,62,7,30,0,0,3,0,191,12.0,20:03,21.4,48.8
16,Patrick Marleau,SJS,L,82,33,37,70,+0,18,11,23,2,2,4,0,285,11.6,20:31,27.3,52.9
17,Anze Kopitar,LAK,C,82,29,41,70,+34,24,10,23,0,0,9,2,200,14.5,20:53,25.4,53.3
18,Matt Duchene,COL,C,71,23,47,70,+8,19,5,17,0,0,6,1,217,10.6,18:29,22.0,50.3
19,Martin St. Louis,"TBL, NYR",R,81,30,39,69,+13,10,9,21,1,2,5,1,204,14.7,20:56,25.7,40.7
20,Patrick Kane,CHI,R,69,29,40,69,+7,22,10,25,0,0,6,0,227,12.8,19:36,22.9,50.0
21,Blake Wheeler,WPG,R,82,28,41,69,+4,63,8,19,0,0,4,2,225,12.4,18:41,24.0,37.5
22,Kyle Okposo,NYI,R,71,27,42,69,-9,51,5,15,0,0,4,1,195,13.8,20:26,22.2,47.5
23,David Krejci,BOS,C,80,19,50,69,+39,28,3,19,0,0,6,1,169,11.2,19:07,21.3,51.2
24,Chris Kunitz,PIT,L,78,35,33,68,+25,66,13,22,0,0,8,0,218,16.1,19:09,22.2,75.0
25,Jonathan Toews,CHI,C,76,28,40,68,+26,34,5,15,3,5,5,0,193,14.5,20:28,25.9,57.2
26,Thomas Vanek,"BUF, NYI, MTL",L,78,27,41,68,+7,46,8,18,0,0,4,0,248,10.9,19:21,21.6,43.5
27,Jaromir Jagr,NJD,R,82,24,43,67,+16,46,5,17,0,0,6,1,231,10.4,19:09,22.8,0.0
28,John Tavares,NYI,C,59,24,42,66,-6,40,8,25,0,0,4,0,188,12.8,21:14,22.3,49.1
29,Jason Spezza,OTT,C,75,23,43,66,-26,46,9,22,0,0,5,0,223,10.3,18:12,23.8,54.0
30,Jordan Eberle,EDM,R,80,28,37,65,-11,18,7,20,1,1,4,1,200,14.0,19:32,25.4,38.1
Overwriting playerstats-2013-2014-top30.csv
!head -n 5 playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
# Read the whole CSV file into ``rows``: a list with one entry per line,
# each entry being the list of field strings on that line.
# newline="" hands newline handling over to the csv module, which is how
# the csv documentation says files for csv.reader should be opened.
with open("playerstats-2013-2014.csv", newline="") as f:
    rows = list(csv.reader(f))
rows[1][1:6]
['Player', 'Team', 'Pos', 'GP', 'G']
rows[2][1:6]
['Sidney Crosby', 'PIT', 'C', '80', '36']
data = np.random.randn(100, 3)
np.savetxt("data.csv", data, delimiter=",", header="x, y, z", comments="# Random x, y, z coordinates\n")
!head -n 5 data.csv
# Random x, y, z coordinates
x, y, z
1.764052345967664026e+00,4.001572083672232938e-01,9.787379841057392005e-01
2.240893199201457797e+00,1.867557990149967484e+00,-9.772778798764110153e-01
9.500884175255893682e-01,-1.513572082976978872e-01,-1.032188517935578448e-01
data_load = np.loadtxt("data.csv", skiprows=2, delimiter=",")
data_load[1,:]
array([ 2.2408932 ,  1.86755799, -0.97727788])
data_load.dtype
dtype('float64')
(data == data_load).all()
True
data = np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", dtype=bytes)
data[0][1:6]
array([b'Sidney Crosby', b'PIT', b'C', b'80', b'36'], dtype='|S13')
np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", usecols=[6,7,8])
array([[ 68., 104.,  18.],
       [ 56.,  87.,  28.],
       [ 58.,  86.,   7.],
       [ 47.,  84.,  16.],
       [ 39.,  82.,  32.]])
df = pd.read_csv("playerstats-2013-2014.csv", skiprows=1)
df = df.set_index("Rank")
df[["Player", "GP", "G", "A", "P"]]
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1 to 5
Data columns (total 20 columns):
Player      5 non-null object
Team        5 non-null object
Pos         5 non-null object
GP          5 non-null int64
G           5 non-null int64
A           5 non-null int64
P           5 non-null int64
+/-         5 non-null int64
PIM         5 non-null int64
PPG         5 non-null int64
PPP         5 non-null int64
SHG         5 non-null int64
SHP         5 non-null int64
GW          5 non-null int64
OT          5 non-null int64
S           5 non-null int64
S%          5 non-null float64
TOI/GP      5 non-null object
Shift/GP    5 non-null float64
FO%         5 non-null float64
dtypes: float64(3), int64(13), object(4)
memory usage: 840.0+ bytes
df[["Player", "GP", "G", "A", "P"]].to_csv("playerstats-2013-2014-subset.csv")
!head -n 5 playerstats-2013-2014-subset.csv
Rank,Player,GP,G,A,P
1,Sidney Crosby,80,36,68,104
2,Ryan Getzlaf,77,31,56,87
3,Claude Giroux,82,28,58,86
4,Tyler Seguin,80,37,47,84

HDF5

h5py

import h5py
# mode = "w", "r", "w-", "r+", "a"
f = h5py.File("data.h5", "w")
f.mode
'r+'
f.flush()
f.close()
f = h5py.File("data.h5", "w")
f.name
'/'
grp1 = f.create_group("experiment1")
grp1.name
'/experiment1'
grp2_meas = f.create_group("experiment2/measurement")
grp2_meas.name
'/experiment2/measurement'
grp2_sim = f.create_group("experiment2/simulation")
grp2_sim.name
'/experiment2/simulation'
f["/experiment1"]
<HDF5 group "/experiment1" (0 members)>
f["/experiment2/simulation"]
<HDF5 group "/experiment2/simulation" (0 members)>
grp_expr2 = f["/experiment2"]
grp_expr2['simulation']
<HDF5 group "/experiment2/simulation" (0 members)>
list(f.keys())
['experiment1', 'experiment2']
list(f.items())
[('experiment1', <HDF5 group "/experiment1" (0 members)>),
 ('experiment2', <HDF5 group "/experiment2" (2 members)>)]
f.visit(lambda x: print(x))
experiment1
experiment2
experiment2/measurement
experiment2/simulation
f.visititems(lambda name, value: print(name, value))
experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (0 members)>
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
"experiment1" in f
True
"simulation" in f["experiment2"]
True
"experiment3" in f
False
f.flush()
!h5ls -r data.h5
/                        Group
/experiment1             Group
/experiment2             Group
/experiment2/measurement Group
/experiment2/simulation  Group
data1 = np.arange(10)
data2 = np.random.randn(100, 100)
f["array1"] = data1
f["/experiment2/measurement/meas1"] = data2
f.visititems(lambda name, value: print(name, value))
array1 <HDF5 dataset "array1": shape (10,), type "<i8">
experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (1 members)>
experiment2/measurement/meas1 <HDF5 dataset "meas1": shape (100, 100), type "<f8">
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
ds = f["array1"]
ds
<HDF5 dataset "array1": shape (10,), type "<i8">
ds.name
'/array1'
ds.dtype
dtype('int64')
ds.shape
(10,)
ds.len()
10
# Read the entire dataset into a NumPy array. ``ds[()]`` is the supported
# replacement for the deprecated ``ds.value`` attribute.
ds[()]





array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
ds = f["/experiment2/measurement/meas1"]
ds
<HDF5 dataset "meas1": shape (100, 100), type "<f8">
ds.dtype
dtype('<f8')
ds.shape
(100, 100)
data_full = ds[...]
type(data_full)
numpy.ndarray
data_full.shape
(100, 100)
data_col = ds[:, 0]
data_col.shape
(100,)
ds[10:20:3, 10:20:3]
array([[ 0.60270766, -0.34804638, -0.813596  , -1.29737966],
       [ 0.91320192, -1.06343294,  0.22734595,  0.52759738],
       [ 1.25774422, -0.32775492,  1.4849256 ,  0.28005786],
       [-0.84907287, -0.30000358,  1.79691852, -0.19871506]])
ds[[1,2,3], :].shape
(3, 100)
ds[[1,2,3], :].shape
(3, 100)
mask = ds[:, 0] > 2.0
mask.shape, mask.dtype
((100,), dtype('bool'))
ds[mask, 0]
array([2.04253623, 2.1041854 , 2.05689385])
ds[mask, :5]
array([[ 2.04253623, -0.91946118,  0.11467003, -0.1374237 ,  1.36552692],
       [ 2.1041854 ,  0.22725706, -1.1291663 , -0.28133197, -0.7394167 ],
       [ 2.05689385,  0.18041971, -0.06670925, -0.02835398,  0.48480475]])
# create empty data sets, assign and update datasets
ds = f.create_dataset("array2", data=np.random.randint(10, size=10))
ds
<HDF5 dataset "array2": shape (10,), type "<i8">
# Read the entire dataset into a NumPy array (``ds[()]`` replaces the
# deprecated ``ds.value``).
ds[()]
array([0, 2, 2, 4, 7, 3, 7, 2, 4, 1])
ds = f.create_dataset("/experiment2/simulation/data1", shape=(5, 5), fillvalue=-1)
ds
<HDF5 dataset "data1": shape (5, 5), type "<f4">
# Read the entire dataset into a NumPy array (``ds[()]`` replaces the
# deprecated ``ds.value``); unwritten elements come back as the fill value.
ds[()]
array([[-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.]], dtype=float32)
ds = f.create_dataset("/experiment1/simulation/data1", shape=(5000, 5000, 5000),
                      fillvalue=0, compression='gzip')
ds
<HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">
ds[:, 0, 0] = np.random.rand(5000)
ds[1, :, 0] += np.random.rand(5000)
ds[:2, :5, 0]
array([[0.6939344 , 0.        , 0.        , 0.        , 0.        ],
       [1.4819994 , 0.01639538, 0.54387355, 0.11130908, 0.9928771 ]],
      dtype=float32)
ds.fillvalue
0.0
f["experiment1"].visititems(lambda name, value: print(name, value))
simulation <HDF5 group "/experiment1/simulation" (1 members)>
simulation/data1 <HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">
# Uncompressed size of the dataset in GiB, computed from metadata only:
# number of elements times bytes per element. No data is read from the file.
float(ds.size * ds.dtype.itemsize) / (1024**3)  # GiB
465.66128730773926
f.flush()
f.filename
'data.h5'
!ls -lh data.h5
-rw-r--r--  1 rob  staff   357K May  6 16:11 data.h5
del f["/experiment1/simulation/data1"]
f["experiment1"].visititems(lambda name, value: print(name, value))
simulation <HDF5 group "/experiment1/simulation" (0 members)>
f.close()
# attributes
# Reopen the existing file in append mode ("a": read/write, create if
# missing). The mode is given explicitly because the cells below assign
# attributes, which needs write access, and the default mode of
# h5py.File differs between h5py versions.
f = h5py.File("data.h5", "a")
f.attrs
<Attributes of HDF5 object at 4768620880>
f.attrs["desc"] = "Result sets from experiments and simulations"
f["experiment1"].attrs["date"] = "2015-1-1"
f["experiment2"].attrs["date"] = "2015-1-2"
f["experiment2/simulation/data1"].attrs["k"] = 1.5
f["experiment2/simulation/data1"].attrs["T"] = 1000
list(f["experiment1"].attrs.keys())
['date']
list(f["experiment2/simulation/data1"].attrs.items())
[('T', 1000), ('k', 1.5)]
"T" in f["experiment2/simulation/data1"].attrs
True
del f["experiment2/simulation/data1"].attrs["T"]
"T" in f["experiment2/simulation/data1"].attrs
False
f["experiment2/simulation/data1"].attrs["t"] = np.array([1, 2, 3])
f["experiment2/simulation/data1"].attrs["t"]
array([1, 2, 3])
f.close()

PyTables

df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
df = df.set_index("Rank")
df[["Player", "Pos", "GP", "P", "G", "A", "S%", "Shift/GP"]].head(5)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

f = tables.open_file("playerstats-2013-2014.h5", mode="w")
grp = f.create_group("/", "season_2013_2014", title="NHL player statistics for the 2013/2014 season")
grp
/season_2013_2014 (Group) 'NHL player statistics for the 2013/2014 season'
  children := []
f.root
/ (RootGroup) ''
  children := ['season_2013_2014' (Group)]
class PlayerStat(tables.IsDescription):
    # Row description for the player-statistics table: every class
    # attribute below declares one column and its default value.

    # Fixed-width string columns (width in bytes given by ``itemsize``).
    player = tables.StringCol(itemsize=20, dflt="")
    position = tables.StringCol(itemsize=1, dflt="C")

    # Unsigned integer columns.
    games_played = tables.UInt8Col(dflt=0)
    points = tables.UInt16Col(dflt=0)
    goals = tables.UInt16Col(dflt=0)
    assists = tables.UInt16Col(dflt=0)

    # Double-precision floating-point columns.
    shooting_percentage = tables.Float64Col(dflt=0.0)
    shifts_per_game_played = tables.Float64Col(dflt=0.0)
top30_table = f.create_table(grp, 'top30', PlayerStat, "Top 30 point leaders")
playerstat = top30_table.row
type(playerstat)
tables.tableextension.Row
# Table field name -> DataFrame column that supplies its value.
column_for_field = {
    "player": "Player",
    "position": "Pos",
    "games_played": "GP",
    "points": "P",
    "goals": "G",
    "assists": "A",
    "shooting_percentage": "S%",
    "shifts_per_game_played": "Shift/GP",
}

# Copy each DataFrame row into the reusable row buffer and append it to
# the table.
for _, series in df.iterrows():
    for field, column in column_for_field.items():
        playerstat[field] = series[column]
    playerstat.append()

# Write the buffered rows out to the table.
top30_table.flush()
top30_table.cols.player[:5]
array([b'Sidney Crosby', b'Ryan Getzlaf', b'Claude Giroux',
       b'Tyler Seguin', b'Corey Perry'], dtype='|S20')
top30_table.cols.points[:5]
array([104,  87,  86,  84,  82], dtype=uint16)
def print_playerstat(row):
    """Print one table row as a tab-separated line.

    The line holds the player name (decoded from UTF-8 bytes and
    right-aligned in a 20-character field), followed by the points,
    goals and assists values.

    row: mapping-like object with the keys "player" (bytes), "points",
         "goals" and "assists".
    """
    name = row["player"].decode('UTF-8')
    fields = (name, row["points"], row["goals"], row["assists"])
    print("%20s\t%s\t%s\t%s" % fields)
for row in top30_table.iterrows():
    print_playerstat(row)
       Sidney Crosby	104	36	68
        Ryan Getzlaf	87	31	56
       Claude Giroux	86	28	58
        Tyler Seguin	84	37	47
         Corey Perry	82	43	39
         Phil Kessel	80	37	43
         Taylor Hall	80	27	53
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
          Jamie Benn	79	34	45
   Nicklas Backstrom	79	18	61
       Patrick Sharp	78	34	44
        Joe Thornton	76	11	65
       Erik Karlsson	74	20	54
       Evgeni Malkin	72	23	49
     Patrick Marleau	70	33	37
        Anze Kopitar	70	29	41
        Matt Duchene	70	23	47
    Martin St. Louis	69	30	39
        Patrick Kane	69	29	40
       Blake Wheeler	69	28	41
         Kyle Okposo	69	27	42
        David Krejci	69	19	50
        Chris Kunitz	68	35	33
      Jonathan Toews	68	28	40
        Thomas Vanek	68	27	41
        Jaromir Jagr	67	24	43
        John Tavares	66	24	42
        Jason Spezza	66	23	43
       Jordan Eberle	65	28	37
for row in top30_table.where("(points > 75) & (points <= 80)"):
    print_playerstat(row)
         Phil Kessel	80	37	43
         Taylor Hall	80	27	53
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
          Jamie Benn	79	34	45
   Nicklas Backstrom	79	18	61
       Patrick Sharp	78	34	44
        Joe Thornton	76	11	65
for row in top30_table.where("(goals > 40) & (points < 80)"):
    print_playerstat(row)
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
f
File(filename=playerstats-2013-2014.h5, title='', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/season_2013_2014 (Group) 'NHL player statistics for the 2013/2014 season'
/season_2013_2014/top30 (Table(30,)) 'Top 30 point leaders'
  description := {
  "assists": UInt16Col(shape=(), dflt=0, pos=0),
  "games_played": UInt8Col(shape=(), dflt=0, pos=1),
  "goals": UInt16Col(shape=(), dflt=0, pos=2),
  "player": StringCol(itemsize=20, shape=(), dflt=b'', pos=3),
  "points": UInt16Col(shape=(), dflt=0, pos=4),
  "position": StringCol(itemsize=1, shape=(), dflt=b'C', pos=5),
  "shifts_per_game_played": Float64Col(shape=(), dflt=0.0, pos=6),
  "shooting_percentage": Float64Col(shape=(), dflt=0.0, pos=7)}
  byteorder := 'little'
  chunkshape := (1489,)
f.flush()
f.close()
!h5ls -rv playerstats-2013-2014.h5
Opened "playerstats-2013-2014.h5" with sec2 driver.
/                        Group
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
        Data:  "GROUP"
    Attribute: PYTABLES_FORMAT_VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
        Data:  "2.1"
    Attribute: TITLE null
        Type:      1-byte null-terminated UTF-8 string

    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
        Data:  "1.0"
    Location:  1:96
    Links:     1
/season_2013_2014        Group
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
        Data:  "GROUP"
    Attribute: TITLE scalar
        Type:      46-byte null-terminated UTF-8 string
        Data:  "NHL player statistics for the 2013/2014 season"
    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
        Data:  "1.0"
    Location:  1:1024
    Links:     1
/season_2013_2014/top30  Dataset {30/Inf}
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
        Data:  "TABLE"
    Attribute: FIELD_0_FILL scalar
        Type:      native unsigned short
        Data:  0
    Attribute: FIELD_0_NAME scalar
        Type:      7-byte null-terminated UTF-8 string
        Data:  "assists"
    Attribute: FIELD_1_FILL scalar
        Type:      native unsigned char
        Data:  0
    Attribute: FIELD_1_NAME scalar
        Type:      12-byte null-terminated UTF-8 string
        Data:  "games_played"
    Attribute: FIELD_2_FILL scalar
        Type:      native unsigned short
        Data:  0
    Attribute: FIELD_2_NAME scalar
        Type:      5-byte null-terminated UTF-8 string
        Data:  "goals"
    Attribute: FIELD_3_FILL scalar
        Type:      1-byte null-terminated ASCII string
        Data:  ""
    Attribute: FIELD_3_NAME scalar
        Type:      6-byte null-terminated UTF-8 string
        Data:  "player"
    Attribute: FIELD_4_FILL scalar
        Type:      native unsigned short
        Data:  0
    Attribute: FIELD_4_NAME scalar
        Type:      6-byte null-terminated UTF-8 string
        Data:  "points"
    Attribute: FIELD_5_FILL scalar
        Type:      1-byte null-terminated ASCII string
        Data:  "C"
    Attribute: FIELD_5_NAME scalar
        Type:      8-byte null-terminated UTF-8 string
        Data:  "position"
    Attribute: FIELD_6_FILL scalar
        Type:      native double
        Data:  0
    Attribute: FIELD_6_NAME scalar
        Type:      22-byte null-terminated UTF-8 string
        Data:  "shifts_per_game_played"
    Attribute: FIELD_7_FILL scalar
        Type:      native double
        Data:  0
    Attribute: FIELD_7_NAME scalar
        Type:      19-byte null-terminated UTF-8 string
        Data:  "shooting_percentage"
    Attribute: NROWS scalar
        Type:      native long
        Data:  30
    Attribute: TITLE scalar
        Type:      20-byte null-terminated UTF-8 string
        Data:  "Top 30 point leaders"
    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
        Data:  "2.7"
    Location:  1:2264
    Links:     1
    Chunks:    {1489} 65516 bytes
    Storage:   1320 logical bytes, 65516 allocated bytes, 2.01% utilization
    Type:      struct {
                   "assists"          +0    native unsigned short
                   "games_played"     +2    native unsigned char
                   "goals"            +3    native unsigned short
                   "player"           +5    20-byte null-terminated ASCII string
                   "points"           +25   native unsigned short
                   "position"         +27   1-byte null-terminated ASCII string
                   "shifts_per_game_played" +28   native double
                   "shooting_percentage" +36   native double
               } 44 bytes
H5tools-DIAG: Error detected in HDF5:tools (1.8.14) thread 0:
  #000: h5tools_dump.c line 1843 in h5tools_dump_mem(): H5Sis_simple failed
    major: Failure in tools library
    minor: error in function

Pandas HDFStore

import pandas as pd
store = pd.HDFStore('store.h5')
df = pd.DataFrame(np.random.rand(5,5))
store["df1"] = df
df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
store["df2"] = df
store.keys()
['/df1', '/df2']
'df2' in store
True
df = store["df1"]
store.root
/ (RootGroup) ''
  children := ['df1' (Group), 'df2' (Group)]
store.close()
# Open the file written by pandas read-only: the cells that follow only
# inspect its groups and datasets. The mode is given explicitly because
# the default mode of h5py.File differs between h5py versions.
f = h5py.File("store.h5", "r")
f.visititems(lambda x, y: print(x, "\t" * int(3 - len(str(x))//8), y))
df1 			 <HDF5 group "/df1" (4 members)>
df1/axis0 		 <HDF5 dataset "axis0": shape (5,), type "<i8">
df1/axis1 		 <HDF5 dataset "axis1": shape (5,), type "<i8">
df1/block0_items 	 <HDF5 dataset "block0_items": shape (5,), type "<i8">
df1/block0_values 	 <HDF5 dataset "block0_values": shape (5, 5), type "<f8">
df2 			 <HDF5 group "/df2" (8 members)>
df2/axis0 		 <HDF5 dataset "axis0": shape (21,), type "|S8">
df2/axis1 		 <HDF5 dataset "axis1": shape (30,), type "<i8">
df2/block0_items 	 <HDF5 dataset "block0_items": shape (3,), type "|S8">
df2/block0_values 	 <HDF5 dataset "block0_values": shape (30, 3), type "<f8">
df2/block1_items 	 <HDF5 dataset "block1_items": shape (14,), type "|S4">
df2/block1_values 	 <HDF5 dataset "block1_values": shape (30, 14), type "<i8">
df2/block2_items 	 <HDF5 dataset "block2_items": shape (4,), type "|S6">
df2/block2_values 	 <HDF5 dataset "block2_values": shape (1,), type "|O">
# Read the whole dataset into a NumPy array (``[()]`` replaces the
# deprecated ``.value`` attribute).
f["/df2/block0_items"][()]
array([b'S%', b'Shift/GP', b'FO%'], dtype='|S8')
f["/df2/block0_values"][:3]
array([[13.9, 24. , 52.5],
       [15.2, 25.2, 49. ],
       [12.6, 25.1, 52.9]])
# Read the whole dataset into a NumPy array (``[()]`` replaces the
# deprecated ``.value`` attribute).
f["/df2/block1_items"][()]
array([b'Rank', b'GP', b'G', b'A', b'P', b'+/-', b'PIM', b'PPG', b'PPP',
       b'SHG', b'SHP', b'GW', b'OT', b'S'], dtype='|S4')
f["/df2/block1_values"][:3, :5]
array([[  1,  80,  36,  68, 104],
       [  2,  77,  31,  56,  87],
       [  3,  82,  28,  58,  86]])

JSON

# Serialize a Python list to a JSON-formatted string.
data = ["string", 1.0, 2, None]
data_json = json.dumps(data)
data_json
'["string", 1.0, 2, null]'
# Decode the JSON text back into Python objects, then inspect the decoded
# copy (``data2``) rather than the original list to confirm the round trip.
data2 = json.loads(data_json)
data2
['string', 1.0, 2, None]
data2[0]
'string'
data = {"one": 1, "two": 2.0, "three": "three"}
data_json = json.dumps(data)
print(data_json)
{"one": 1, "two": 2.0, "three": "three"}
data = json.loads(data_json)
data["two"]
2.0
data["three"]
'three'
data = {"one": [1], 
        "two": [1, 2], 
        "three": [1, 2, 3]}
data_json = json.dumps(data, indent=True)
print(data_json)
{
 "one": [
  1
 ],
 "two": [
  1,
  2
 ],
 "three": [
  1,
  2,
  3
 ]
}
data = {"one": [1], 
        "two": {"one": 1, "two": 2}, 
        "three": [(1,), (1, 2), (1, 2, 3)],
        "four": "a text string"}
with open("data.json", "w") as f:
    json.dump(data, f)
!cat data.json
{"one": [1], "two": {"one": 1, "two": 2}, "three": [[1], [1, 2], [1, 2, 3]], "four": "a text string"}
with open("data.json", "r") as f:
    data_from_file = json.load(f)
data_from_file["two"]
{'one': 1, 'two': 2}
data_from_file["three"]
[[1], [1, 2], [1, 2, 3]]
!head -n 20 tokyo-metro.json
{
    "C": {
        "color": "#149848", 
        "transfers": [
            [
                "C3", 
                "F15"
            ], 
            [
                "C4", 
                "Z2"
            ], 
            [
                "C4", 
                "G2"
            ], 
            [
                "C7", 
                "M14"
            ], 
!wc tokyo-metro.json
    1471    1508   27638 tokyo-metro.json
with open("tokyo-metro.json", "r") as f:
    data = json.load(f)
data.keys()
dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])
data["C"].keys()
dict_keys(['color', 'transfers', 'travel_times'])
data["C"]["color"]
'#149848'
data["C"]["transfers"]
[['C3', 'F15'],
 ['C4', 'Z2'],
 ['C4', 'G2'],
 ['C7', 'M14'],
 ['C7', 'N6'],
 ['C7', 'G6'],
 ['C8', 'M15'],
 ['C8', 'H6'],
 ['C9', 'H7'],
 ['C9', 'Y18'],
 ['C11', 'T9'],
 ['C11', 'M18'],
 ['C11', 'Z8'],
 ['C12', 'M19'],
 ['C18', 'H21']]
[(s, e, tt) for s, e, tt in data["C"]["travel_times"] if tt == 1]
[('C3', 'C4', 1), ('C7', 'C8', 1), ('C9', 'C10', 1)]
data
{'C': {'color': '#149848',
  'transfers': [['C3', 'F15'],
   ['C4', 'Z2'],
   ['C4', 'G2'],
   ['C7', 'M14'],
   ['C7', 'N6'],
   ['C7', 'G6'],
   ['C8', 'M15'],
   ['C8', 'H6'],
   ['C9', 'H7'],
   ['C9', 'Y18'],
   ['C11', 'T9'],
   ['C11', 'M18'],
   ['C11', 'Z8'],
   ['C12', 'M19'],
   ['C18', 'H21']],
  'travel_times': [['C1', 'C2', 2],
   ['C2', 'C3', 2],
   ['C3', 'C4', 1],
   ['C4', 'C5', 2],
   ['C5', 'C6', 2],
   ['C6', 'C7', 2],
   ['C7', 'C8', 1],
   ['C8', 'C9', 3],
   ['C9', 'C10', 1],
   ['C10', 'C11', 2],
   ['C11', 'C12', 2],
   ['C12', 'C13', 2],
   ['C13', 'C14', 2],
   ['C14', 'C15', 2],
   ['C15', 'C16', 2],
   ['C16', 'C17', 3],
   ['C17', 'C18', 3],
   ['C18', 'C19', 3]]},
 'G': {'color': '#f59230',
  'transfers': [['G1', 'Z1'],
   ['G1', 'F16'],
   ['G2', 'Z2'],
   ['G2', 'C4'],
   ['G4', 'Z3'],
   ['G5', 'M13'],
   ['G5', 'Y16'],
   ['G5', 'Z4'],
   ['G5', 'N7'],
   ['G6', 'N6'],
   ['G6', 'M14'],
   ['G6', 'C7'],
   ['G9', 'M16'],
   ['G9', 'H8'],
   ['G11', 'T10'],
   ['G12', 'Z9'],
   ['G15', 'H16'],
   ['G16', 'H17']],
  'travel_times': [['G1', 'G2', 2],
   ['G2', 'G3', 1],
   ['G3', 'G4', 2],
   ['G4', 'G5', 2],
   ['G5', 'G6', 2],
   ['G6', 'G7', 2],
   ['G7', 'G8', 2],
   ['G8', 'G9', 2],
   ['G9', 'G10', 1],
   ['G10', 'G11', 2],
   ['G11', 'G12', 2],
   ['G12', 'G13', 1],
   ['G13', 'G14', 2],
   ['G14', 'G15', 2],
   ['G15', 'G16', 1],
   ['G16', 'G17', 2],
   ['G17', 'G18', 1],
   ['G18', 'G19', 2]]},
 'F': {'color': '#b96528',
  'transfers': [['F1', 'Y1'],
   ['F2', 'Y2'],
   ['F3', 'Y3'],
   ['F4', 'Y4'],
   ['F5', 'Y5'],
   ['F6', 'Y6'],
   ['F7', 'Y7'],
   ['F8', 'Y8'],
   ['F9', 'Y9'],
   ['F9', 'M25'],
   ['F13', 'M9'],
   ['F15', 'C3'],
   ['F16', 'Z1'],
   ['F16', 'G1']],
  'travel_times': [['F1', 'F2', 3],
   ['F2', 'F3', 2],
   ['F3', 'F4', 3],
   ['F4', 'F5', 2],
   ['F5', 'F6', 2],
   ['F6', 'F7', 2],
   ['F7', 'F8', 2],
   ['F8', 'F9', 2],
   ['F9', 'F10', 3],
   ['F10', 'F11', 2],
   ['F11', 'F12', 2],
   ['F12', 'F13', 2],
   ['F13', 'F14', 3],
   ['F14', 'F15', 2],
   ['F15', 'F16', 2]]},
 'H': {'color': '#9cacb5',
  'transfers': [['H6', 'M15'],
   ['H6', 'C8'],
   ['H7', 'Y18'],
   ['H7', 'C9'],
   ['H8', 'M16'],
   ['H8', 'G9'],
   ['H12', 'T11'],
   ['H16', 'G15'],
   ['H17', 'G16'],
   ['H21', 'C18']],
  'travel_times': [['H1', 'H2', 3],
   ['H2', 'H3', 3],
   ['H3', 'H4', 3],
   ['H4', 'H5', 3],
   ['H5', 'H6', 2],
   ['H6', 'H7', 3],
   ['H7', 'H8', 1],
   ['H8', 'H9', 2],
   ['H9', 'H10', 2],
   ['H10', 'H11', 2],
   ['H11', 'H12', 1],
   ['H12', 'H13', 3],
   ['H13', 'H14', 1],
   ['H14', 'H15', 2],
   ['H15', 'H16', 2],
   ['H16', 'H17', 1],
   ['H17', 'H18', 2],
   ['H18', 'H19', 2],
   ['H19', 'H20', 2],
   ['H20', 'H21', 3]]},
 'M': {'color': '#ff0000',
  'transfers': [['M9', 'F13'],
   ['M12', 'N8'],
   ['M13', 'G5'],
   ['M13', 'Y16'],
   ['M13', 'Z4'],
   ['M13', 'N7'],
   ['M14', 'C7'],
   ['M14', 'G6'],
   ['M14', 'N6'],
   ['M15', 'H6'],
   ['M15', 'C8'],
   ['M16', 'G9'],
   ['M16', 'H8'],
   ['M18', 'T9'],
   ['M18', 'C11'],
   ['M18', 'Z8'],
   ['M19', 'C12'],
   ['M22', 'N11'],
   ['M25', 'Y9'],
   ['M25', 'F9']],
  'travel_times': [['M1', 'M2', 2],
   ['M2', 'M3', 2],
   ['M3', 'M4', 2],
   ['M4', 'M5', 2],
   ['M5', 'M6', 2],
   ['M6', 'M7', 2],
   ['M7', 'M8', 2],
   ['M8', 'M9', 2],
   ['M9', 'M10', 1],
   ['M10', 'M11', 2],
   ['M11', 'M12', 2],
   ['M12', 'M13', 3],
   ['M13', 'M14', 2],
   ['M14', 'M15', 1],
   ['M15', 'M16', 3],
   ['M16', 'M17', 2],
   ['M17', 'M18', 2],
   ['M18', 'M19', 2],
   ['M19', 'M20', 1],
   ['M20', 'M21', 2],
   ['M21', 'M22', 2],
   ['M22', 'M23', 3],
   ['M23', 'M24', 2],
   ['M24', 'M25', 3],
   ['m3', 'm4', 2],
   ['m4', 'm5', 2],
   ['m5', 'M6', 2]]},
 'N': {'color': '#1aaca9',
  'transfers': [['N1', 'T1'],
   ['N2', 'T2'],
   ['N3', 'T3'],
   ['N6', 'G6'],
   ['N6', 'M14'],
   ['N6', 'C7'],
   ['N7', 'Y16'],
   ['N7', 'Z4'],
   ['N7', 'G5'],
   ['N7', 'M13'],
   ['N8', 'M12'],
   ['N9', 'Y14'],
   ['N10', 'Y13'],
   ['N10', 'T6'],
   ['N11', 'M22']],
  'travel_times': [['N1', 'N2', 2],
   ['N2', 'N3', 2],
   ['N3', 'N4', 2],
   ['N4', 'N5', 2],
   ['N5', 'N6', 2],
   ['N6', 'N7', 2],
   ['N7', 'N8', 2],
   ['N8', 'N9', 2],
   ['N9', 'N10', 2],
   ['N10', 'N11', 2],
   ['N11', 'N12', 3],
   ['N12', 'N13', 2],
   ['N13', 'N14', 2],
   ['N14', 'N15', 3],
   ['N15', 'N16', 1],
   ['N16', 'N17', 3],
   ['N17', 'N18', 2],
   ['N18', 'N19', 2]]},
 'T': {'color': '#1aa7d8',
  'transfers': [['T6', 'N10'],
   ['T6', 'Y13'],
   ['T7', 'Z6'],
   ['T9', 'M18'],
   ['T9', 'C11'],
   ['T9', 'Z8'],
   ['T10', 'G11'],
   ['T11', 'H12']],
  'travel_times': [['T1', 'T2', 0],
   ['T2', 'T3', 3],
   ['T3', 'T4', 6],
   ['T4', 'T5', 9],
   ['T5', 'T6', 11],
   ['T6', 'T7', 13],
   ['T7', 'T8', 14],
   ['T8', 'T9', 16],
   ['T9', 'T10', 18],
   ['T10', 'T11', 20],
   ['T11', 'T12', 21],
   ['T12', 'T13', 24],
   ['T13', 'T14', 26],
   ['T14', 'T15', 27],
   ['T15', 'T16', 30],
   ['T16', 'T17', 33],
   ['T17', 'T18', 35],
   ['T18', 'T19', 37],
   ['T19', 'T20', 39],
   ['T20', 'T21', 41],
   ['T21', 'T22', 43],
   ['T22', 'T23', 46],
   ['T23', 'T24', 49]]},
 'Y': {'color': '#ede7c3',
  'transfers': [['Y1', 'F1'],
   ['Y2', 'F2'],
   ['Y3', 'F3'],
   ['Y4', 'F4'],
   ['Y5', 'F5'],
   ['Y6', 'F6'],
   ['Y7', 'F7'],
   ['Y8', 'F8'],
   ['Y9', 'F9'],
   ['Y9', 'M25'],
   ['Y13', 'T6'],
   ['Y13', 'N10'],
   ['Y14', 'N9'],
   ['Y16', 'Z4'],
   ['Y16', 'N7'],
   ['Y16', 'G5'],
   ['Y16', 'M13'],
   ['Y18', 'H7'],
   ['Y18', 'C9']],
  'travel_times': [['Y1', 'Y2', 4],
   ['Y2', 'Y3', 2],
   ['Y3', 'Y4', 3],
   ['Y4', 'Y5', 2],
   ['Y5', 'Y6', 2],
   ['Y6', 'Y7', 2],
   ['Y7', 'Y8', 2],
   ['Y8', 'Y9', 3],
   ['Y9', 'Y10', 2],
   ['Y10', 'Y11', 2],
   ['Y11', 'Y12', 2],
   ['Y12', 'Y13', 3],
   ['Y13', 'Y14', 2],
   ['Y14', 'Y15', 2],
   ['Y15', 'Y16', 1],
   ['Y16', 'Y17', 2],
   ['Y17', 'Y18', 2],
   ['Y18', 'Y19', 2],
   ['Y19', 'Y20', 2],
   ['Y20', 'Y21', 2],
   ['Y21', 'Y22', 2],
   ['Y22', 'Y23', 3],
   ['Y23', 'Y24', 2]]},
 'Z': {'color': '#a384bf',
  'transfers': [['Z1', 'F16'],
   ['Z1', 'G1'],
   ['Z2', 'C4'],
   ['Z2', 'G2'],
   ['Z3', 'G4'],
   ['Z4', 'Y16'],
   ['Z4', 'N7'],
   ['Z4', 'M13'],
   ['Z4', 'G5'],
   ['Z6', 'T7'],
   ['Z8', 'M18'],
   ['Z8', 'C11'],
   ['Z8', 'T9'],
   ['Z9', 'G12']],
  'travel_times': [['Z1', 'Z2', 3],
   ['Z2', 'Z3', 2],
   ['Z3', 'Z4', 2],
   ['Z4', 'Z5', 2],
   ['Z5', 'Z6', 2],
   ['Z6', 'Z7', 2],
   ['Z7', 'Z8', 2],
   ['Z8', 'Z9', 2],
   ['Z9', 'Z10', 3],
   ['Z10', 'Z11', 3],
   ['Z11', 'Z12', 3],
   ['Z12', 'Z13', 2],
   ['Z13', 'Z14', 2]]}}
!ls -lh tokyo-metro.json
-rw-r--r--  1 rob  staff    27K Mar 25  2018 tokyo-metro.json
data_pack = msgpack.packb(data)
# del data
type(data_pack)
bytes
len(data_pack)
3021
with open("tokyo-metro.msgpack", "wb") as f:
    f.write(data_pack)
!ls -lh tokyo-metro.msgpack
-rw-r--r--  1 rob  staff   3.0K May  6 16:12 tokyo-metro.msgpack
with open("tokyo-metro.msgpack", "rb") as f:
    data_msgpack = f.read()
    data = msgpack.unpackb(data_msgpack)
list(data.keys())
[b'C', b'G', b'F', b'H', b'M', b'N', b'T', b'Y', b'Z']
with open("tokyo-metro.pickle", "wb") as f:
    pickle.dump(data, f)
del data
!ls -lh tokyo-metro.pickle
-rw-r--r--  1 rob  staff   8.5K May  6 16:12 tokyo-metro.pickle
with open("tokyo-metro.pickle", "rb") as f:
    data = pickle.load(f)
data.keys()
dict_keys([b'C', b'G', b'F', b'H', b'M', b'N', b'T', b'Y', b'Z'])

Versions

%reload_ext version_information
%version_information numpy, pandas, csv, json, tables, h5py, msgpack