Source code for mlc_func.ml.network

""" Module that implements Energy_Network, the machine learned correcting functional (MLCF)
for energies
"""

import numpy as np
import tensorflow as tf
import pandas as pd
import os
import sys
from sklearn.model_selection import train_test_split
from .ml_util import *
try:
    from .preprocessing import *
except ImportError:
    print('Preprocessing module not loaded')
from matplotlib import pyplot as plt
import math
import pickle
from collections import namedtuple
import h5py
import json
from ase.io import read
import mlc_func.elf as elf
Dataset = namedtuple("Dataset", "data species")


[docs]class Energy_Network():
    """ Machine learned correcting functional (MLCF) for energies

        Parameters
        ----------

            subnets: list of Subnetwork
                each subnetwork belongs to a single atom inside the system
                and computes the atomic contributio to the total energy
    """
    def __init__(self, subnets):


        if not isinstance(subnets, list):
            self.subnets = [subnets]
        else:
            self.subnets = subnets


        self.model_loaded = False
        self.rand_state = np.random.get_state()
        self.graph = None
        self.target_mean = 0
        self.target_std = 1
        self.sess = None
        self.graph = None
        self.initialized = False
        self.optimizer = None
        self.checkpoint_path = None
        self.masks = {}
        self.species_nets = {}
        self.species_nets_names = {}
        self.species_gradients_names = {}
        scale_together(subnets)

    # ========= Network operations ============ #

    def __add__(self, other):
        if isinstance(other, Subnet):
            if not len(self.subnets) == 1:
                raise Exception(" + operator only valid if only one training set contained")
            else:
                self.subnets[0] += [other]
        else:
            raise Exception("Datatypes not compatible")

        return self

    def __mod__(self, other):
        if isinstance(other, Subnet):
            self.subnets += [[other]]
        elif isinstance(other, Energy_Network):
            self.subnets += other.subnets
        else:
            raise Exception("Datatypes not compatible")

        return self

    def reset(self):
        self.sess = None
        self.graph = None
        self.initialized = False
        self.optimizer = None
        self.checkpoint_path = None


[docs]    def construct_network(self):
        """ Builds the tensorflow graph from subnets
        """

        cnt = 0
        logits = []
        for subnet in self.subnets:
            if isinstance(subnet,list):
                sublist = []
                for s in subnet:
                    sublist.append(s.get_logits(cnt)[0])
                    cnt += 1
                logits.append(sublist)
            else:
                logits.append(subnet.get_logits(cnt)[0])
                cnt += 1

        return logits

    def sample_training(self, percentage):


        for subnet in self.subnets:
            n_samples = len(subnet[0].y_train)
            n_select = np.ceil(percentage*n_samples).astype(int)
            if n_select < 2: n_select = 2
            selection = np.arange(n_samples).astype(int)
            np.random.shuffle(selection)
            selection = selection[:n_select]
            for s in subnet:
                s.X_train = s.X_train[:,selection]
                s.y_train = s.y_train[selection]



[docs]    def get_feed(self, which = 'train', train_valid_split = 0.8, seed = 42):
        """ Return a dictionary that can be used as a feed_dict in tensorflow

        Parameters
        -----------
            which: {'train',test'}
                which part of the dataset is used
            train_valid_split: float
                ratio of train and validation set size
            seed: int
                seed parameter for the random shuffle algorithm, make

        Returns
        --------
            (dictionary, dictionary)
                either (training feed dictionary, validation feed dict.)
                or (testing feed dictionary, None)
        """
        train_feed_dict = {}
        valid_feed_dict = {}
        test_feed_dict = {}

        for subnet in self.subnets:
            if isinstance(subnet,list):
                for s in subnet:
                    train_feed_dict.update(s.get_feed('train', train_valid_split, seed))
                    valid_feed_dict.update(s.get_feed('valid', train_valid_split, seed))
                    test_feed_dict.update(s.get_feed('test', train_valid_split, seed))
            else:
                train_feed_dict.update(subnet.get_feed('train', train_valid_split, seed))
                valid_feed_dict.update(subnet.get_feed('valid', train_valid_split, seed))
                test_feed_dict.update(subnet.get_feed('test', seed = seed))

        if which == 'train':
            return train_feed_dict, valid_feed_dict
        elif which == 'test':
            return test_feed_dict, None


[docs]    def get_cost(self):
        """ Build the tensorflow node that defines the cost function

        Returns
        -------
            cost_list: [tensorflow.placeholder]
                list of costs for subnets. subnets
                whose outputs are added together share cost functions
        """
        cost_list = []

        for subnet in self.subnets:
            if isinstance(subnet,list):
                cost = 0
                y_ = self.graph.get_tensor_by_name(subnet[0].y_name)
                log = 0
                for s in subnet:
                    log += self.graph.get_tensor_by_name(s.logits_name)
                cost += tf.reduce_mean(tf.reduce_mean(tf.square(y_-log),0))
            else:
                log = self.graph.get_tensor_by_name(subnet.logits_name)
                y_ = self.graph.get_tensor_by_name(subnet.y_name)
                cost = tf.reduce_mean(tf.reduce_mean(tf.square(y_-log),0))
            cost_list.append(cost)

        return cost_list




[docs]    def train(self,
              step_size=0.01,
              max_steps=50001,
              b_=0,
              verbose=True,
              optimizer=None,
              adaptive_rate=False,
              multiplier = 1.0):

        """ Train the master neural network

            Parameters
            ----------
                step_size: float
                    step size for gradient descent
                max_steps: int
                    number of training epochs
                b: float
                    regularization parameter
                verbose: boolean
                    print cost for intermediate training epochs
                optimizer: tf.nn.GradientDescentOptimizer,tf.nn.AdamOptimizer, ...
                adaptive_rate: boolean
                    wether to adjust step_size if cost increases
                                not recommended for AdamOptimizer
                multiplier: list of float
                    multiplier that allow to give datasets more
                    weight than others

            Returns
            --------
            None
        """


        self.model_loaded = True
        if self.graph is None:
            self.graph = tf.Graph()
            build_graph = True
        else:
            build_graph = False

        with self.graph.as_default():

            if self.sess == None:
                sess = tf.Session()
                self.sess = sess
            else:
                sess = self.sess



            # Get number of distinct subnet species
            species = {}
            for net in self.subnets:
                if isinstance(net,list):
                    for net in net:
                        for l,_ in enumerate(net.layers):
                            name = net.species
                            species[name] = 1
                else:
                    for l,_ in enumerate(net.layers):
                        name = net.species
                        species[name] = 1
            n_species = len(species)


            # Build all the required tensors
            b = {}
            if build_graph:
                self.construct_network()
                for s in species:
                    b[s] = tf.placeholder(tf.float32,name = '{}/b'.format(s))
            else:
                for s in species:
                    b[s] = self.graph.get_tensor_by_name('{}/b:0'.format(s))

            cost_list = self.get_cost()
            train_feed_dict, valid_feed_dict = self.get_feed('train')
            cost = 0
            if not isinstance(multiplier, list):
                multiplier = [1.0]*len(cost_list)
            print('multipliers: {}'.format(multiplier))
            for c, m in zip(cost_list, multiplier):
                cost += c*m

            # L2-loss
            loss = 0
            with tf.variable_scope("", reuse=True):
                for net in self.subnets:
                    if isinstance(net,list):
                        for net in net:
                            for l, layer in enumerate(net.layers):
                                name = net.species
                                loss += tf.nn.l2_loss(tf.get_variable("{}/W{}".format(name, l+1))) * \
                                        b[name]/layer
                    else:
                        for l, layer in enumerate(net.layers):
                            name = net.species
                            loss += tf.nn.l2_loss(tf.get_variable("{}/W{}".format(name, l+1))) * \
                                b[name]/layer

            cost += loss

            for i, s in enumerate(species):
                train_feed_dict['{}/b:0'.format(s)] = b_[i]
                valid_feed_dict['{}/b:0'.format(s)] = 0


            if self.optimizer == None:
                if optimizer == None:
                    self.optimizer = tf.train.AdamOptimizer(learning_rate = step_size)
                else:
                    self.optimizer = optimizer

            train_step = self.optimizer.minimize(cost)

            # Workaround to load the AdamOptimizer variables
            if not self.checkpoint_path == None:
                saver = tf.train.Saver()
                saver.restore(self.sess,self.checkpoint_path)
                self.checkpoint_path = None

            initialize_uninitialized(self.sess)

            self.initialized = True

            train_writer = tf.summary.FileWriter('./log/',
                                      self.graph)
            old_cost = 1e8

            for _ in range(0,max_steps):

                sess.run(train_step,feed_dict=train_feed_dict)

                if _%int(max_steps/100) == 0 and adaptive_rate == True:
                    new_cost = sess.run(tf.sqrt(cost),
                        feed_dict=train_feed_dict)

                    if new_cost > old_cost:
                        step_size /= 2
                        print('Step size decreased to {}'.format(step_size))
                        train_step = tf.train.GradientDescentOptimizer(step_size).minimize(cost)
                    old_cost = new_cost

                if _%int(max_steps/10) == 0 and verbose:
                    print('Step: ' + str(_))
                    print('Training set loss:')
                    if len(cost_list) > 1:
                        for i, c in enumerate(cost_list):
                            print('{}: {}'.format(i,sess.run(tf.sqrt(c),feed_dict=train_feed_dict)))
                    print('Total: {}'.format(sess.run(tf.sqrt(cost-loss),feed_dict=train_feed_dict)))
                    print('Validation set loss:')
                    if len(cost_list) > 1:
                        for i, c in enumerate(cost_list):
                            print('{}: {}'.format(i,sess.run(tf.sqrt(c),feed_dict=valid_feed_dict)))
                    print('Total: {}'.format(sess.run(tf.sqrt(cost),feed_dict=valid_feed_dict)))
                    print('--------------------')
                    print('L2-loss: {}'.format(sess.run(loss,feed_dict=train_feed_dict)))

[docs]    def predict(self, features, species, use_masks = False, return_gradient = False):
        """ Get predicted energies

        Parameters
        ----------
        features: np.ndarray
            input features
        species: str
            predict atomic contribution to energy for this species
        use_masks: bool
             whether masks should be applied to the provided features
        return_gradient: bool
            instead of returning energies, return gradient of network
            w.r.t. input features

        Returns
        -------
        np.ndarray
            predicted energies or gradient

        """
        species = species.lower()
        if features.ndim == 2:
            features = features.reshape(-1,1,features.shape[1])
        else:
            raise Exception('features.ndim != 2')
        if use_masks:
            features = features[:,:,self.masks[species]]

        ds = Dataset(features, species)
        targets = np.zeros(features.shape[0])

        if not species in self.species_nets:
            self.species_nets[species] = Subnet()
            found = False

            snet = self.species_nets[species]
            for s in self.subnets:
                if found == True:
                    break
                if isinstance(s,list):
                    for s2 in s:
                        if s2.species == ds.species:
                            snet.scaler = s2.scaler
                            snet.layers = s2.layers
                            snet.targets = s2.targets
                            snet.activations = s2.activations
                            print("Sharing scaler with species " + s2.species)
                            found = True
                            break
                else:
                    if s.species == ds.species:
                        snet.scaler = s.scaler
                        snet.layers = s.layers
                        snet.targets = s.targets
                        snet.activations = s.activations
                        print("Sharing scaler with species " + s.species)
                        break

        snet = self.species_nets[species]
        snet.add_dataset(ds, targets, test_size=0.0)
        if not self.model_loaded:
            raise Exception('Model not loaded!')
        else:
            with self.graph.as_default():
                if species in self.species_nets_names:
                    logits = self.graph.get_tensor_by_name(self.species_nets_names[species])
                    gradients = self.graph.get_tensor_by_name(self.species_gradients_names[species])
                else:
                    logits, x, _ = snet.get_logits(1)
                    gradients = tf.gradients(logits, x)[0].values
                    self.species_nets_names[species] = logits.name
                    self.species_gradients_names[species] = gradients.name
                sess = self.sess
                energies = sess.run(logits, feed_dict=snet.get_feed(which='train',
                     train_valid_split=1.0))
                if return_gradient:
                    grad = sess.run(gradients, feed_dict=snet.get_feed(which='train',
                     train_valid_split=1.0))[0]
                    grad = grad/np.sqrt(snet.scaler.var_).reshape(1,-1)
                    energies = (energies, grad)

                return energies

    def get_weights(self, species, subnet):
        W = []
        b = []
        with self.graph.as_default():
            with tf.variable_scope("", reuse=True):
                for l, layer in enumerate(subnet.layers + ['']):
                    W.append(self.sess.run(tf.get_variable("{}/W{}".format(species, l + 1))))
                    b.append(self.sess.run(tf.get_variable("{}/b{}".format(species, l + 1))))
        return W, b
[docs]    def get_energies(self, summarize = True, which = 'train'):
        """ Uses trained model on training or test sets

        Parameters
        -----------
        which: str
            {'train','test'} which set logits are computed for

        Returns
        --------
         list of numpy.ndarray
            resulting energies grouped by independent subnet datasets
        """

        if not self.model_loaded:
            raise Exception('Model not loaded!')
        else:
            with self.graph.as_default():

                logits_list = self.construct_network()

                sess = self.sess
                feed_dict, _ = self.get_feed(train_valid_split = 1.0, which = which)

                return_list = []

                for logits in logits_list:
                    if isinstance(logits,list):
                        result = 0
                        for logits in logits:
                            if summarize:
                                result += sess.run(logits,feed_dict=feed_dict)
                            else:
                                return_list.append(sess.run(logits,feed_dict=feed_dict))
                        if summarize:
                            return_list.append(result)
                    else:
                        return_list.append(sess.run(logits,feed_dict=feed_dict))
                return return_list

[docs]    def save_model(self, path):
        """ Save trained model to path
        """

        if path[-5:] == '.ckpt':
            path = path[:-5]

        with self.graph.as_default():
            sess = self.sess
            saver = tf.train.Saver()
            saver.save(sess,save_path = path + '.ckpt')

[docs]    def restore_model(self, path):
        """ Load trained model from path
        """

        if path[-5:] == '.ckpt':
            path = path[:-5]

        self.checkpoint_path = path + '.ckpt'
        g = tf.Graph()
        with g.as_default():
            sess = tf.Session()
            self.construct_network()
            b = tf.placeholder(tf.float32,name = 'b')
            saver = tf.train.Saver()
            saver.restore(sess,path + '.ckpt')
            self.model_loaded = True
            self.sess = sess
            self.graph = g
            self.initialized = True

[docs]    def save_all(self, net_dir, override = False):
        """ Saves the model including all subnets and datasets
        using pickle to directory net_dir, if directory exists only
        save if override = True
        """
        try:
            os.mkdir(net_dir)
        except FileExistsError:
            if override:
                print('Overriding...')
                import shutil
                shutil.rmtree(net_dir)
                os.mkdir(net_dir)
            else:
                print('Directory/Network already exists. Network not saved...')
                return None

        # Pickle does not seem to be compatible with tensorflow so just
        # save subnetworks with it
        with open(os.path.join(net_dir,'subnets'),'wb') as file:
            pickle.dump(self.subnets,file)

        to_save = {'mask': self.masks}

        self.save_model(os.path.join(net_dir,'model'))

        pickle.dump(to_save, open(os.path.join(net_dir,'supp'), 'wb'))


[docs]    def load_all(self, net_dir):
        """ Loads the model in net_dir including all subnets and datasets
        using pickle
        """

        with open(os.path.join(net_dir,'subnets'),'rb') as file:
            self.subnets = pickle.load(file)

        self.restore_model(os.path.join(net_dir,'model'))
        self.masks = pickle.load(open(os.path.join(net_dir,'supp'), 'rb'))['mask']

[docs]class Subnet():
    """ Subnetwork that is associated with one Atom
    """

    seed = 42

    def __init__(self):
        self.species = None
        self.n_copies = 0
        self.rad_param = None
        self.ang_param = None
        self.scaler = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.name = None
        self.constructor = fc_nn_g
        self.logits_name = None
        self.x_name = None
        self.y_name = None
        self.layers = [8] * 3
        self.targets = 1
        self.activations = [tf.nn.sigmoid] * 3

    def __add__(self, other):
        if not isinstance(other,Subnet):
            raise Exception("Incompatible data types")
        else:
            return Energy_Network([[self,other]])

    def __mod__(self, other):
        if not isinstance(other,Subnet):
            raise Exception("Incompatible data types")
        else:
            return Energy_Network([[self],[other]])


[docs]    def get_feed(self, which, train_valid_split = 0.8, seed = None):
        """ Return a dictionary that can be used as a feed_dict in tensorflow

        Parameters
        -----------
            which: str,
                {'train', 'valid', 'test'}
                which part of the dataset is used
            train_valid_split: float
                ratio of train and validation set size
            seed: int
                seed parameter for the random shuffle algorithm

        Returns
        --------
            dict
        """
        if seed == None:
            seed = Subnet.seed

        if train_valid_split == 1.0:
            shuffle = False
        else:
            shuffle = True


        if which == 'train' or which == 'valid':

            X_train = np.concatenate([self.X_train[i] for i in range(self.n_copies)],
                axis = 1)

            X_train, X_valid, y_train, y_valid = \
                train_test_split(X_train,self.y_train,
                                 test_size = 1 - train_valid_split,
                                 random_state = seed, shuffle = shuffle)
            X_train, X_valid = reshape_group(X_train, self.n_copies) , \
                               reshape_group(X_valid, self.n_copies)

            if which == 'train':
                return {self.x_name : X_train, self.y_name: y_train}
            else:
                return {self.x_name : X_valid, self.y_name : y_valid}

        elif which == 'test':

            return {self.x_name : self.X_test, self.y_name: self.y_test}



[docs]    def get_logits(self, i):
        """ Builds the subnetwork by defining logits and placeholders

        Parameters
        -----------
            i: int
                index to label datasets

        Returns
        ---------
            tensorflow tensors
        """

        with tf.variable_scope(self.name) as scope:
                        try:
                            logits,x,y_ = self.constructor(self, i, np.mean(self.targets), np.std(self.targets))
                        except ValueError:
                            scope.reuse_variables()
                            logits,x,y_ = self.constructor(self, i, np.mean(self.targets), np.std(self.targets))

        self.logits_name = logits.name
        self.x_name = x.name
        self.y_name = y_.name
        return logits, x, y_

[docs]    def save(self, path):
        """ Use pickle to save the subnet to path
        """

        with open(path,'wb') as file:
            pickle.dump(self,file)

[docs]    def load(self, path):
        """ Load subnet from path
        """

        with open(path, 'rb') as file:
            self = pickle.load(file)

[docs]    def add_dataset(self, dataset, targets,
        test_size = 0.2, target_filter = None, scale = True):
        """ Adds dataset to the subnetwork.

            Parameters
            -----------
                dataset: dataset
                    contains datasets that will be associated with subnetwork for training and
                    evaluation
                targets: np.ndarray
                    target values for training and evaluation

            Returns
            --------
                None
        """

        if self.species != None:
            if self.species != dataset.species:
                raise Exception("Dataset species does not equal subnet species")
        else:
            self.species = dataset.species

        if not self.n_copies == 0:
            if self.n_copies != dataset.data.shape[1]:
                raise Exception("New dataset incompatible with contained one.")

        self.n_copies = dataset.data.shape[1]
        self.name = dataset.species


        if not test_size == 0.0:
            X_train, X_test, y_train, y_test = \
                train_test_split(dataset.data, targets,
                    test_size= test_size, random_state = Subnet.seed, shuffle = True)
        else:
            X_train = dataset.data
            y_train = targets
            X_test = np.array(X_train)
            y_test = np.array(y_train)

        if scale:
            if self.scaler == None:
                scaler = StandardScaler()
                # scaler = MinMaxScaler(feature_range=(-2,2))
                scaler.fit(X_train.reshape(-1, X_train.shape[2]))
            else:
                scaler = self.scaler

            X_train = scaler.transform(X_train.reshape(-1,
                X_train.shape[2])).reshape(X_train.shape)
            X_test = scaler.transform(X_test.reshape(-1,
                X_test.shape[2])).reshape(X_test.shape)

            self.scaler = scaler

        self.X_train = X_train.swapaxes(0,1)
        self.X_test = X_test.swapaxes(0,1)
        self.features = X_train.shape[2]

        if y_train.ndim == 1:
            self.y_train = y_train.reshape(-1,1)
            self.y_test = y_test.reshape(-1,1)
        else:
            self.y_train = y_train
            self.y_test = y_test

        assert len(X_train) == len(y_train)
        assert len(X_test) == len(y_test)


[docs]def load_energy_model(path):
    """ Load energy MLCF

        Parameters
        ---------
            path: str
                directory in which energy MLCF is stored

        Returns
        -------
            Energy_Network
    """
    network = Energy_Network([])
    network.load_all(path)
    return network

[docs]def build_energy_mlcf(feature_src, target_src, masks = {}, automask_std = 0,
    filters = [], autofilt_percent = 0, test_size = 0.2):

    """ Return a trainable energy MLCF (neural network)

    Parameters
    ----------

        feature_src: list
            list of paths to the hdf5 containing the features
        target_src: list
            list of paths to the csv files containing the target energies
            entries in target_scr and feature_src correspond to each other
        masks: dict,
            containing list booleans; can be used to select which features to use.
            keys specify the atomic species.
            default: use all features

        automask_std: float,
            if mask not set exclude all features whose stdev across dataset
            is smaller than this value

        filters: list,
            containing list of booleans; can be used to exclude datapoints
            in sets (e.g. outliers)

        autofilt_percent: float,
            exclude this percentile of extreme datapoints from set
            (only if filters not set)
        test_size: float,
            relative size of hold_out (test) set
    Returns
    -------

        Energy_Network

    """

    if not len(feature_src) == len(target_src):
        raise Exception('Please provided only one target location for each feature set')

    sets = []
    if len(filters) != len(feature_src):
        filters = [0]*len(feature_src)

    no_mask = False

    for fsrc, tsrc, filter in zip(feature_src, target_src, filters):

        elfs, _ = elf.utils.hdf5_to_elfs_fast(fsrc)

        targets = np.genfromtxt(tsrc, delimiter = ',')

        if not isinstance(filter, list) and not isinstance(filter, np.ndarray):

            percentile_cutoff = autofilt_percent
            lim1 = np.percentile(targets, percentile_cutoff*100)
            lim2 = np.percentile(targets, (1 - percentile_cutoff)*100)
            min_lim, max_lim = min(lim1,lim2), max(lim1,lim2)
            filter = (targets > min_lim) & (targets < max_lim)

        if len(masks) != len(elfs):
            no_mask = True
            for species in elfs:
                feat = np.array(elfs[species])
                masks[species] = (np.std(feat.reshape(-1,feat.shape[-1]),
                        axis = 0) > automask_std)

        targets = targets[filter]
        subnets = []
        for species in elfs:
            feat = np.array(elfs[species])[:,:,masks[species]]
            feat = feat[filter]
            for j in range(feat.shape[1]):
                subnets.append(Subnet())
                subnets[-1].add_dataset(Dataset(feat[:,j:j+1], species),
                    targets, test_size = 0.2)

        sets.append(subnets)
    network = Energy_Network(sets)
    network.masks = masks
    return network

[docs]def get_energy_filters(target_src, autofilt_percent = 0):
    """ For a given energy target dataset return filter that cutoff the
    upper and lower percentile specified in autofilt_percent

    Parameters
    ----------
    target_src: str
        path of csv file containing energy targets
    autofilt_percent: float
        percentile to cut off

    Returns
    --------
        list of bool
            filters
    """
    filters = []
    for tsrc in target_src:
        targets = np.genfromtxt(tsrc, delimiter = ',')
        percentile_cutoff = autofilt_percent
        lim1 = np.percentile(targets, percentile_cutoff*100)
        lim2 = np.percentile(targets, (1 - percentile_cutoff)*100)
        min_lim, max_lim = min(lim1,lim2), max(lim1,lim2)
        filter = (targets > min_lim) & (targets < max_lim)
        filters.append(filter)
    return filters