""" Module that implements Energy_Network, the machine learned correcting functional (MLCF)
for energies
"""
import numpy as np
import tensorflow as tf
import pandas as pd
import os
import sys
from sklearn.model_selection import train_test_split
from .ml_util import *
try:
from .preprocessing import *
except ImportError:
print('Preprocessing module not loaded')
from matplotlib import pyplot as plt
import math
import pickle
from collections import namedtuple
import h5py
import json
from ase.io import read
import mlc_func.elf as elf
Dataset = namedtuple("Dataset", "data species")
[docs]class Energy_Network():
""" Machine learned correcting functional (MLCF) for energies
Parameters
----------
subnets: list of Subnetwork
each subnetwork belongs to a single atom inside the system
and computes the atomic contributio to the total energy
"""
def __init__(self, subnets):
if not isinstance(subnets, list):
self.subnets = [subnets]
else:
self.subnets = subnets
self.model_loaded = False
self.rand_state = np.random.get_state()
self.graph = None
self.target_mean = 0
self.target_std = 1
self.sess = None
self.graph = None
self.initialized = False
self.optimizer = None
self.checkpoint_path = None
self.masks = {}
self.species_nets = {}
self.species_nets_names = {}
self.species_gradients_names = {}
scale_together(subnets)
# ========= Network operations ============ #
def __add__(self, other):
if isinstance(other, Subnet):
if not len(self.subnets) == 1:
raise Exception(" + operator only valid if only one training set contained")
else:
self.subnets[0] += [other]
else:
raise Exception("Datatypes not compatible")
return self
def __mod__(self, other):
if isinstance(other, Subnet):
self.subnets += [[other]]
elif isinstance(other, Energy_Network):
self.subnets += other.subnets
else:
raise Exception("Datatypes not compatible")
return self
def reset(self):
self.sess = None
self.graph = None
self.initialized = False
self.optimizer = None
self.checkpoint_path = None
[docs] def construct_network(self):
""" Builds the tensorflow graph from subnets
"""
cnt = 0
logits = []
for subnet in self.subnets:
if isinstance(subnet,list):
sublist = []
for s in subnet:
sublist.append(s.get_logits(cnt)[0])
cnt += 1
logits.append(sublist)
else:
logits.append(subnet.get_logits(cnt)[0])
cnt += 1
return logits
def sample_training(self, percentage):
for subnet in self.subnets:
n_samples = len(subnet[0].y_train)
n_select = np.ceil(percentage*n_samples).astype(int)
if n_select < 2: n_select = 2
selection = np.arange(n_samples).astype(int)
np.random.shuffle(selection)
selection = selection[:n_select]
for s in subnet:
s.X_train = s.X_train[:,selection]
s.y_train = s.y_train[selection]
[docs] def get_feed(self, which = 'train', train_valid_split = 0.8, seed = 42):
""" Return a dictionary that can be used as a feed_dict in tensorflow
Parameters
-----------
which: {'train',test'}
which part of the dataset is used
train_valid_split: float
ratio of train and validation set size
seed: int
seed parameter for the random shuffle algorithm, make
Returns
--------
(dictionary, dictionary)
either (training feed dictionary, validation feed dict.)
or (testing feed dictionary, None)
"""
train_feed_dict = {}
valid_feed_dict = {}
test_feed_dict = {}
for subnet in self.subnets:
if isinstance(subnet,list):
for s in subnet:
train_feed_dict.update(s.get_feed('train', train_valid_split, seed))
valid_feed_dict.update(s.get_feed('valid', train_valid_split, seed))
test_feed_dict.update(s.get_feed('test', train_valid_split, seed))
else:
train_feed_dict.update(subnet.get_feed('train', train_valid_split, seed))
valid_feed_dict.update(subnet.get_feed('valid', train_valid_split, seed))
test_feed_dict.update(subnet.get_feed('test', seed = seed))
if which == 'train':
return train_feed_dict, valid_feed_dict
elif which == 'test':
return test_feed_dict, None
[docs] def get_cost(self):
""" Build the tensorflow node that defines the cost function
Returns
-------
cost_list: [tensorflow.placeholder]
list of costs for subnets. subnets
whose outputs are added together share cost functions
"""
cost_list = []
for subnet in self.subnets:
if isinstance(subnet,list):
cost = 0
y_ = self.graph.get_tensor_by_name(subnet[0].y_name)
log = 0
for s in subnet:
log += self.graph.get_tensor_by_name(s.logits_name)
cost += tf.reduce_mean(tf.reduce_mean(tf.square(y_-log),0))
else:
log = self.graph.get_tensor_by_name(subnet.logits_name)
y_ = self.graph.get_tensor_by_name(subnet.y_name)
cost = tf.reduce_mean(tf.reduce_mean(tf.square(y_-log),0))
cost_list.append(cost)
return cost_list
[docs] def train(self,
step_size=0.01,
max_steps=50001,
b_=0,
verbose=True,
optimizer=None,
adaptive_rate=False,
multiplier = 1.0):
""" Train the master neural network
Parameters
----------
step_size: float
step size for gradient descent
max_steps: int
number of training epochs
b: float
regularization parameter
verbose: boolean
print cost for intermediate training epochs
optimizer: tf.nn.GradientDescentOptimizer,tf.nn.AdamOptimizer, ...
adaptive_rate: boolean
wether to adjust step_size if cost increases
not recommended for AdamOptimizer
multiplier: list of float
multiplier that allow to give datasets more
weight than others
Returns
--------
None
"""
self.model_loaded = True
if self.graph is None:
self.graph = tf.Graph()
build_graph = True
else:
build_graph = False
with self.graph.as_default():
if self.sess == None:
sess = tf.Session()
self.sess = sess
else:
sess = self.sess
# Get number of distinct subnet species
species = {}
for net in self.subnets:
if isinstance(net,list):
for net in net:
for l,_ in enumerate(net.layers):
name = net.species
species[name] = 1
else:
for l,_ in enumerate(net.layers):
name = net.species
species[name] = 1
n_species = len(species)
# Build all the required tensors
b = {}
if build_graph:
self.construct_network()
for s in species:
b[s] = tf.placeholder(tf.float32,name = '{}/b'.format(s))
else:
for s in species:
b[s] = self.graph.get_tensor_by_name('{}/b:0'.format(s))
cost_list = self.get_cost()
train_feed_dict, valid_feed_dict = self.get_feed('train')
cost = 0
if not isinstance(multiplier, list):
multiplier = [1.0]*len(cost_list)
print('multipliers: {}'.format(multiplier))
for c, m in zip(cost_list, multiplier):
cost += c*m
# L2-loss
loss = 0
with tf.variable_scope("", reuse=True):
for net in self.subnets:
if isinstance(net,list):
for net in net:
for l, layer in enumerate(net.layers):
name = net.species
loss += tf.nn.l2_loss(tf.get_variable("{}/W{}".format(name, l+1))) * \
b[name]/layer
else:
for l, layer in enumerate(net.layers):
name = net.species
loss += tf.nn.l2_loss(tf.get_variable("{}/W{}".format(name, l+1))) * \
b[name]/layer
cost += loss
for i, s in enumerate(species):
train_feed_dict['{}/b:0'.format(s)] = b_[i]
valid_feed_dict['{}/b:0'.format(s)] = 0
if self.optimizer == None:
if optimizer == None:
self.optimizer = tf.train.AdamOptimizer(learning_rate = step_size)
else:
self.optimizer = optimizer
train_step = self.optimizer.minimize(cost)
# Workaround to load the AdamOptimizer variables
if not self.checkpoint_path == None:
saver = tf.train.Saver()
saver.restore(self.sess,self.checkpoint_path)
self.checkpoint_path = None
initialize_uninitialized(self.sess)
self.initialized = True
train_writer = tf.summary.FileWriter('./log/',
self.graph)
old_cost = 1e8
for _ in range(0,max_steps):
sess.run(train_step,feed_dict=train_feed_dict)
if _%int(max_steps/100) == 0 and adaptive_rate == True:
new_cost = sess.run(tf.sqrt(cost),
feed_dict=train_feed_dict)
if new_cost > old_cost:
step_size /= 2
print('Step size decreased to {}'.format(step_size))
train_step = tf.train.GradientDescentOptimizer(step_size).minimize(cost)
old_cost = new_cost
if _%int(max_steps/10) == 0 and verbose:
print('Step: ' + str(_))
print('Training set loss:')
if len(cost_list) > 1:
for i, c in enumerate(cost_list):
print('{}: {}'.format(i,sess.run(tf.sqrt(c),feed_dict=train_feed_dict)))
print('Total: {}'.format(sess.run(tf.sqrt(cost-loss),feed_dict=train_feed_dict)))
print('Validation set loss:')
if len(cost_list) > 1:
for i, c in enumerate(cost_list):
print('{}: {}'.format(i,sess.run(tf.sqrt(c),feed_dict=valid_feed_dict)))
print('Total: {}'.format(sess.run(tf.sqrt(cost),feed_dict=valid_feed_dict)))
print('--------------------')
print('L2-loss: {}'.format(sess.run(loss,feed_dict=train_feed_dict)))
[docs] def predict(self, features, species, use_masks = False, return_gradient = False):
""" Get predicted energies
Parameters
----------
features: np.ndarray
input features
species: str
predict atomic contribution to energy for this species
use_masks: bool
whether masks should be applied to the provided features
return_gradient: bool
instead of returning energies, return gradient of network
w.r.t. input features
Returns
-------
np.ndarray
predicted energies or gradient
"""
species = species.lower()
if features.ndim == 2:
features = features.reshape(-1,1,features.shape[1])
else:
raise Exception('features.ndim != 2')
if use_masks:
features = features[:,:,self.masks[species]]
ds = Dataset(features, species)
targets = np.zeros(features.shape[0])
if not species in self.species_nets:
self.species_nets[species] = Subnet()
found = False
snet = self.species_nets[species]
for s in self.subnets:
if found == True:
break
if isinstance(s,list):
for s2 in s:
if s2.species == ds.species:
snet.scaler = s2.scaler
snet.layers = s2.layers
snet.targets = s2.targets
snet.activations = s2.activations
print("Sharing scaler with species " + s2.species)
found = True
break
else:
if s.species == ds.species:
snet.scaler = s.scaler
snet.layers = s.layers
snet.targets = s.targets
snet.activations = s.activations
print("Sharing scaler with species " + s.species)
break
snet = self.species_nets[species]
snet.add_dataset(ds, targets, test_size=0.0)
if not self.model_loaded:
raise Exception('Model not loaded!')
else:
with self.graph.as_default():
if species in self.species_nets_names:
logits = self.graph.get_tensor_by_name(self.species_nets_names[species])
gradients = self.graph.get_tensor_by_name(self.species_gradients_names[species])
else:
logits, x, _ = snet.get_logits(1)
gradients = tf.gradients(logits, x)[0].values
self.species_nets_names[species] = logits.name
self.species_gradients_names[species] = gradients.name
sess = self.sess
energies = sess.run(logits, feed_dict=snet.get_feed(which='train',
train_valid_split=1.0))
if return_gradient:
grad = sess.run(gradients, feed_dict=snet.get_feed(which='train',
train_valid_split=1.0))[0]
grad = grad/np.sqrt(snet.scaler.var_).reshape(1,-1)
energies = (energies, grad)
return energies
def get_weights(self, species, subnet):
W = []
b = []
with self.graph.as_default():
with tf.variable_scope("", reuse=True):
for l, layer in enumerate(subnet.layers + ['']):
W.append(self.sess.run(tf.get_variable("{}/W{}".format(species, l + 1))))
b.append(self.sess.run(tf.get_variable("{}/b{}".format(species, l + 1))))
return W, b
[docs] def get_energies(self, summarize = True, which = 'train'):
""" Uses trained model on training or test sets
Parameters
-----------
which: str
{'train','test'} which set logits are computed for
Returns
--------
list of numpy.ndarray
resulting energies grouped by independent subnet datasets
"""
if not self.model_loaded:
raise Exception('Model not loaded!')
else:
with self.graph.as_default():
logits_list = self.construct_network()
sess = self.sess
feed_dict, _ = self.get_feed(train_valid_split = 1.0, which = which)
return_list = []
for logits in logits_list:
if isinstance(logits,list):
result = 0
for logits in logits:
if summarize:
result += sess.run(logits,feed_dict=feed_dict)
else:
return_list.append(sess.run(logits,feed_dict=feed_dict))
if summarize:
return_list.append(result)
else:
return_list.append(sess.run(logits,feed_dict=feed_dict))
return return_list
[docs] def save_model(self, path):
""" Save trained model to path
"""
if path[-5:] == '.ckpt':
path = path[:-5]
with self.graph.as_default():
sess = self.sess
saver = tf.train.Saver()
saver.save(sess,save_path = path + '.ckpt')
[docs] def restore_model(self, path):
""" Load trained model from path
"""
if path[-5:] == '.ckpt':
path = path[:-5]
self.checkpoint_path = path + '.ckpt'
g = tf.Graph()
with g.as_default():
sess = tf.Session()
self.construct_network()
b = tf.placeholder(tf.float32,name = 'b')
saver = tf.train.Saver()
saver.restore(sess,path + '.ckpt')
self.model_loaded = True
self.sess = sess
self.graph = g
self.initialized = True
[docs] def save_all(self, net_dir, override = False):
""" Saves the model including all subnets and datasets
using pickle to directory net_dir, if directory exists only
save if override = True
"""
try:
os.mkdir(net_dir)
except FileExistsError:
if override:
print('Overriding...')
import shutil
shutil.rmtree(net_dir)
os.mkdir(net_dir)
else:
print('Directory/Network already exists. Network not saved...')
return None
# Pickle does not seem to be compatible with tensorflow so just
# save subnetworks with it
with open(os.path.join(net_dir,'subnets'),'wb') as file:
pickle.dump(self.subnets,file)
to_save = {'mask': self.masks}
self.save_model(os.path.join(net_dir,'model'))
pickle.dump(to_save, open(os.path.join(net_dir,'supp'), 'wb'))
[docs] def load_all(self, net_dir):
""" Loads the model in net_dir including all subnets and datasets
using pickle
"""
with open(os.path.join(net_dir,'subnets'),'rb') as file:
self.subnets = pickle.load(file)
self.restore_model(os.path.join(net_dir,'model'))
self.masks = pickle.load(open(os.path.join(net_dir,'supp'), 'rb'))['mask']
[docs]class Subnet():
""" Subnetwork that is associated with one Atom
"""
seed = 42
def __init__(self):
self.species = None
self.n_copies = 0
self.rad_param = None
self.ang_param = None
self.scaler = None
self.X_train = None
self.X_test = None
self.y_train = None
self.y_test = None
self.name = None
self.constructor = fc_nn_g
self.logits_name = None
self.x_name = None
self.y_name = None
self.layers = [8] * 3
self.targets = 1
self.activations = [tf.nn.sigmoid] * 3
def __add__(self, other):
if not isinstance(other,Subnet):
raise Exception("Incompatible data types")
else:
return Energy_Network([[self,other]])
def __mod__(self, other):
if not isinstance(other,Subnet):
raise Exception("Incompatible data types")
else:
return Energy_Network([[self],[other]])
[docs] def get_feed(self, which, train_valid_split = 0.8, seed = None):
""" Return a dictionary that can be used as a feed_dict in tensorflow
Parameters
-----------
which: str,
{'train', 'valid', 'test'}
which part of the dataset is used
train_valid_split: float
ratio of train and validation set size
seed: int
seed parameter for the random shuffle algorithm
Returns
--------
dict
"""
if seed == None:
seed = Subnet.seed
if train_valid_split == 1.0:
shuffle = False
else:
shuffle = True
if which == 'train' or which == 'valid':
X_train = np.concatenate([self.X_train[i] for i in range(self.n_copies)],
axis = 1)
X_train, X_valid, y_train, y_valid = \
train_test_split(X_train,self.y_train,
test_size = 1 - train_valid_split,
random_state = seed, shuffle = shuffle)
X_train, X_valid = reshape_group(X_train, self.n_copies) , \
reshape_group(X_valid, self.n_copies)
if which == 'train':
return {self.x_name : X_train, self.y_name: y_train}
else:
return {self.x_name : X_valid, self.y_name : y_valid}
elif which == 'test':
return {self.x_name : self.X_test, self.y_name: self.y_test}
[docs] def get_logits(self, i):
""" Builds the subnetwork by defining logits and placeholders
Parameters
-----------
i: int
index to label datasets
Returns
---------
tensorflow tensors
"""
with tf.variable_scope(self.name) as scope:
try:
logits,x,y_ = self.constructor(self, i, np.mean(self.targets), np.std(self.targets))
except ValueError:
scope.reuse_variables()
logits,x,y_ = self.constructor(self, i, np.mean(self.targets), np.std(self.targets))
self.logits_name = logits.name
self.x_name = x.name
self.y_name = y_.name
return logits, x, y_
[docs] def save(self, path):
""" Use pickle to save the subnet to path
"""
with open(path,'wb') as file:
pickle.dump(self,file)
[docs] def load(self, path):
""" Load subnet from path
"""
with open(path, 'rb') as file:
self = pickle.load(file)
[docs] def add_dataset(self, dataset, targets,
test_size = 0.2, target_filter = None, scale = True):
""" Adds dataset to the subnetwork.
Parameters
-----------
dataset: dataset
contains datasets that will be associated with subnetwork for training and
evaluation
targets: np.ndarray
target values for training and evaluation
Returns
--------
None
"""
if self.species != None:
if self.species != dataset.species:
raise Exception("Dataset species does not equal subnet species")
else:
self.species = dataset.species
if not self.n_copies == 0:
if self.n_copies != dataset.data.shape[1]:
raise Exception("New dataset incompatible with contained one.")
self.n_copies = dataset.data.shape[1]
self.name = dataset.species
if not test_size == 0.0:
X_train, X_test, y_train, y_test = \
train_test_split(dataset.data, targets,
test_size= test_size, random_state = Subnet.seed, shuffle = True)
else:
X_train = dataset.data
y_train = targets
X_test = np.array(X_train)
y_test = np.array(y_train)
if scale:
if self.scaler == None:
scaler = StandardScaler()
# scaler = MinMaxScaler(feature_range=(-2,2))
scaler.fit(X_train.reshape(-1, X_train.shape[2]))
else:
scaler = self.scaler
X_train = scaler.transform(X_train.reshape(-1,
X_train.shape[2])).reshape(X_train.shape)
X_test = scaler.transform(X_test.reshape(-1,
X_test.shape[2])).reshape(X_test.shape)
self.scaler = scaler
self.X_train = X_train.swapaxes(0,1)
self.X_test = X_test.swapaxes(0,1)
self.features = X_train.shape[2]
if y_train.ndim == 1:
self.y_train = y_train.reshape(-1,1)
self.y_test = y_test.reshape(-1,1)
else:
self.y_train = y_train
self.y_test = y_test
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
[docs]def load_energy_model(path):
""" Load energy MLCF
Parameters
---------
path: str
directory in which energy MLCF is stored
Returns
-------
Energy_Network
"""
network = Energy_Network([])
network.load_all(path)
return network
[docs]def build_energy_mlcf(feature_src, target_src, masks = {}, automask_std = 0,
filters = [], autofilt_percent = 0, test_size = 0.2):
""" Return a trainable energy MLCF (neural network)
Parameters
----------
feature_src: list
list of paths to the hdf5 containing the features
target_src: list
list of paths to the csv files containing the target energies
entries in target_scr and feature_src correspond to each other
masks: dict,
containing list booleans; can be used to select which features to use.
keys specify the atomic species.
default: use all features
automask_std: float,
if mask not set exclude all features whose stdev across dataset
is smaller than this value
filters: list,
containing list of booleans; can be used to exclude datapoints
in sets (e.g. outliers)
autofilt_percent: float,
exclude this percentile of extreme datapoints from set
(only if filters not set)
test_size: float,
relative size of hold_out (test) set
Returns
-------
Energy_Network
"""
if not len(feature_src) == len(target_src):
raise Exception('Please provided only one target location for each feature set')
sets = []
if len(filters) != len(feature_src):
filters = [0]*len(feature_src)
no_mask = False
for fsrc, tsrc, filter in zip(feature_src, target_src, filters):
elfs, _ = elf.utils.hdf5_to_elfs_fast(fsrc)
targets = np.genfromtxt(tsrc, delimiter = ',')
if not isinstance(filter, list) and not isinstance(filter, np.ndarray):
percentile_cutoff = autofilt_percent
lim1 = np.percentile(targets, percentile_cutoff*100)
lim2 = np.percentile(targets, (1 - percentile_cutoff)*100)
min_lim, max_lim = min(lim1,lim2), max(lim1,lim2)
filter = (targets > min_lim) & (targets < max_lim)
if len(masks) != len(elfs):
no_mask = True
for species in elfs:
feat = np.array(elfs[species])
masks[species] = (np.std(feat.reshape(-1,feat.shape[-1]),
axis = 0) > automask_std)
targets = targets[filter]
subnets = []
for species in elfs:
feat = np.array(elfs[species])[:,:,masks[species]]
feat = feat[filter]
for j in range(feat.shape[1]):
subnets.append(Subnet())
subnets[-1].add_dataset(Dataset(feat[:,j:j+1], species),
targets, test_size = 0.2)
sets.append(subnets)
network = Energy_Network(sets)
network.masks = masks
return network
[docs]def get_energy_filters(target_src, autofilt_percent = 0):
""" For a given energy target dataset return filter that cutoff the
upper and lower percentile specified in autofilt_percent
Parameters
----------
target_src: str
path of csv file containing energy targets
autofilt_percent: float
percentile to cut off
Returns
--------
list of bool
filters
"""
filters = []
for tsrc in target_src:
targets = np.genfromtxt(tsrc, delimiter = ',')
percentile_cutoff = autofilt_percent
lim1 = np.percentile(targets, percentile_cutoff*100)
lim2 = np.percentile(targets, (1 - percentile_cutoff)*100)
min_lim, max_lim = min(lim1,lim2), max(lim1,lim2)
filter = (targets > min_lim) & (targets < max_lim)
filters.append(filter)
return filters