#StackBounty: #python #pandas #numpy #tensorflow Tensorflow TypeError: Can't convert 'numpy.int64' object to str implicitly

Bounty: 150

Here is my Jupyter notebook:

import pandas as pd
from pprint import pprint
import pickle 
import numpy as np

with open('preDF.p', 'rb') as f:
    preDF = pickle.load(f)
#pprint(preDF)
df = pd.DataFrame(data=preDF)
#df.rename(columns={166: '166'}, inplace=True)
df.head()
0 1   2   3   4   5   6   7   8   9   ... 157 158 159 160 161 162 163 164 165 166
0 3   8   1   13  15  13  9   12  12  1   ... 0   0   0   0   0   0   0   0   0   1
1 3   1   13  15  13  9   12  12  1   27  ... 0   0   0   0   0   0   0   0   0   1
2 3   8   1   13  15  13  9   12  12  1   ... 0   0   0   0   0   0   0   0   0   1
3 13  5   20  18  9   3   1   18  9   1   ... 0   0   0   0   0   0   0   0   0   1
4 3   8   12  15  18  8   5   24  9   4   ... 0   0   0   0   0   0   0   0   0   2
5 rows × 167 columns
import numpy as np 
#msk = np.random.rand(len(df)) < 0.8
#train = df[msk]
#test = df[~msk]

from sklearn.model_selection import KFold
kf = KFold(n_splits=2)
train_index, test_index = next(kf.split(df))  # take the indices of the first fold
train = df.iloc[train_index]
test = df.iloc[test_index]
train.columns = train.columns.astype(np.int32)
test.columns = test.columns.astype(np.int32)


import tensorflow as tf

def train_input_fn(features, labels, batch_size):
    """An input function for training"""
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features.astype(np.int32)), labels.astype(np.int32)))

    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)

    # Return the dataset.
    return dataset


def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction"""
    features=dict(features.astype(np.int32))
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset

def load_data(train,test,y_name=166):

    train_x, train_y = train, train.pop(y_name)

    test_x, test_y = test, test.pop(y_name)

    return (train_x, train_y), (test_x, test_y)

def main(train,test):
    batch_size = np.int32(100)
    train_steps = np.int32(1000)
    # Fetch the data

    SPECIES = ['neg', 'stable', 'pos']
    (train_x, train_y), (test_x, test_y) = load_data(train,test)

    # Feature columns describe how to use the input.
    my_feature_columns = []
    for key in train_x.keys():
        my_feature_columns.append(tf.feature_column.numeric_column(key=key))

    # Build 2 hidden layer DNN with 10, 10 units respectively.
    classifier = tf.estimator.DNNClassifier(
        feature_columns=my_feature_columns,
        # Two hidden layers of 10 nodes each.
        hidden_units=[30, 10,30],
        # The model must choose between 3 classes.
        n_classes=3)

    classifier.train(
        input_fn=lambda:train_input_fn(train_x, train_y,
                                                 batch_size),
        steps=train_steps)
    # Evaluate the model.
    eval_result = classifier.evaluate(
        input_fn=lambda:eval_input_fn(test_x, test_y,
                                                batch_size))

    print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

    # Generate predictions from the model
    expected = ['exp neg', 'exp stable', 'exp pos']
    predict_x = {
        'open': [5.1, 5.9, 6.9],
        'high': [3.3, 3.0, 3.1],
        'low':   [1.7, 4.2, 5.4],
        'close': [0.5, 1.5, 2.1],
    }

    predictions = classifier.predict(
        input_fn=lambda:eval_input_fn(predict_x,
                                                labels=None,
                                                batch_size=batch_size))

    template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')

    for pred_dict, expec in zip(predictions, expected):
        class_id = pred_dict['class_ids'][0]
        probability = pred_dict['probabilities'][class_id]

        print(template.format(SPECIES[class_id],
                              100 * probability, expec))


if __name__ == '__main__':
    #tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run(main(train,test))

So I get this error :

INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpz7rw1puj
INFO:tensorflow:Using config: {'_task_type': 'worker', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f478ba9bdd8>, '_tf_random_seed': None, '_keep_checkpoint_max': 5, '_is_chief': True, '_master': '', '_session_config': None, '_log_step_count_steps': 100, '_global_id_in_cluster': 0, '_evaluation_master': '', '_service': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_task_id': 0, '_num_worker_replicas': 1, '_model_dir': '/tmp/tmpz7rw1puj', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000}
INFO:tensorflow:Calling model_fn.
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-141-fcd417d2c3ff> in <module>()
     98 if __name__ == '__main__':
     99     #tf.logging.set_verbosity(tf.logging.INFO)
--> 100     tf.app.run(main(train,test))

<ipython-input-141-fcd417d2c3ff> in main(train, test)
     64         input_fn=lambda:train_input_fn(train_x, train_y,
     65                                                  batch_size),
---> 66         steps=train_steps)
     67     # Evaluate the model.
     68     eval_result = classifier.evaluate(

/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
    350 
    351     saving_listeners = _check_listeners_type(saving_listeners)
--> 352     loss = self._train_model(input_fn, hooks, saving_listeners)
    353     logging.info('Loss for final step: %s.', loss)
    354     return self

/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
    810       worker_hooks.extend(input_hooks)
    811       estimator_spec = self._call_model_fn(
--> 812           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
    813 
    814       if self._warm_start_settings:

/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py in _call_model_fn(self, features, labels, mode, config)
    791 
    792     logging.info('Calling model_fn.')
--> 793     model_fn_results = self._model_fn(features=features, **kwargs)
    794     logging.info('Done calling model_fn.')
    795 

/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/canned/dnn.py in _model_fn(features, labels, mode, config)
    352           dropout=dropout,
    353           input_layer_partitioner=input_layer_partitioner,
--> 354           config=config)
    355 
    356     super(DNNClassifier, self).__init__(

/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/canned/dnn.py in _dnn_model_fn(features, labels, mode, head, hidden_units, feature_columns, optimizer, activation_fn, dropout, input_layer_partitioner, config)
    183         dropout=dropout,
    184         input_layer_partitioner=input_layer_partitioner)
--> 185     logits = logit_fn(features=features, mode=mode)
    186 
    187     def _train_op_fn(loss):

/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/canned/dnn.py in dnn_logit_fn(features, mode)
     89         partitioner=input_layer_partitioner):
     90       net = feature_column_lib.input_layer(
---> 91           features=features, feature_columns=feature_columns)
     92     for layer_id, num_hidden_units in enumerate(hidden_units):
     93       with variable_scope.variable_scope(

/usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column.py in input_layer(features, feature_columns, weight_collections, trainable, cols_to_vars)
    271   """
    272   return _internal_input_layer(features, feature_columns, weight_collections,
--> 273                                trainable, cols_to_vars)
    274 
    275 

/usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column.py in _internal_input_layer(features, feature_columns, weight_collections, trainable, cols_to_vars, scope)
    192       ordered_columns.append(column)
    193       with variable_scope.variable_scope(
--> 194           None, default_name=column._var_scope_name):  # pylint: disable=protected-access
    195         tensor = column._get_dense_tensor(  # pylint: disable=protected-access
    196             builder,

/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py in __enter__(self)
   1901 
   1902     try:
-> 1903       return self._enter_scope_uncached()
   1904     except:
   1905       if self._graph_context_manager is not None:

/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py in _enter_scope_uncached(self)
   2006         raise
   2007       self._current_name_scope = current_name_scope
-> 2008       unique_default_name = _get_unique_variable_scope(self._default_name)
   2009       pure_variable_scope = _pure_variable_scope(
   2010           unique_default_name,

/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py in _get_unique_variable_scope(prefix)
   1690   var_store = _get_default_variable_store()
   1691   current_scope = get_variable_scope()
-> 1692   name = current_scope.name + "/" + prefix if current_scope.name else prefix
   1693   if var_store.variable_scope_count(name) == 0:
   1694     return prefix

TypeError: Can't convert 'numpy.int64' object to str implicitly

My guess is that a simpler version of this worked before numpy was involved.

Now that I've brought in numpy, every int is an int64, and somewhere TensorFlow tries a simple conversion of an int to a string.

Since an int64 cannot be converted to a string implicitly, that conversion now fails, because all the ints are int64 by default.

But I have some problems finding which int is the problematic one.
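
One thing that seems worth trying (a hedged sketch, not tested against this notebook): the DNNClassifier builds a variable scope name from each feature column's key, and that key needs to be a Python str, so integer column labels end up in the string concatenation that raises this error. Casting the DataFrame column labels to strings before building the feature columns should avoid it:

# hypothetical fix: keep column labels as strings so feature column keys are str
train.columns = train.columns.astype(str)
test.columns = test.columns.astype(str)

# load_data would then need y_name='166' instead of y_name=166
my_feature_columns = [tf.feature_column.numeric_column(key=key)
                      for key in train_x.keys()]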

The notebook is here:
https://www.dropbox.com/s/rx8v5aap3zhoshm/NewML.html?dl=1
and the pickle preDF is here:
https://www.dropbox.com/s/wd831906jq3o1jl/preDF.p?dl=1



#StackBounty: #python #performance #beginner #numpy #markov-chain Solve the phase state between two haplotype blocks using markov trans…

Bounty: 50

I have spent a little more than a year in Python, but I come from a biology background. I know enough about for-loops and nested for-loops to work out a solution to the problem I have, but I suppose there are better and more comprehensive ways of solving it.

Below is the code I wrote to solve the haplotype phase state between two haplotype blocks.

Here is a short snippet of my data:

chr  pos       ms01e_PI  ms01e_PG_al  ms02g_PI  ms02g_PG_al  ms03g_PI  ms03g_PG_al  ms04h_PI  ms04h_PG_al  ms05h_PI  ms05h_PG_al  ms06h_PI  ms06h_PG_al
2    15881764  4         C|T          6         C|T          7         T|T          7         T|T          7         C|T          7         C|T
2    15881767  4         C|C          6         T|C          7         C|C          7         C|C          7         T|C          7         C|C
2    15881989  4         C|C          6         A|C          7         C|C          7         C|C          7         A|T          7         A|C
2    15882091  4         G|T          6         G|T          7         T|A          7         A|A          7         A|T          7         A|C
2    15882451  4         C|T          4         T|C          7         T|T          7         T|T          7         C|T          7         C|A
2    15882454  4         C|T          4         T|C          7         T|T          7         T|T          7         C|T          7         C|T
2    15882493  4         C|T          4         T|C          7         T|T          7         T|T          7         C|T          7         C|T
2    15882505  4         A|T          4         T|A          7         T|T          7         T|T          7         A|C          7         A|T

Problem:

In the given data I want to solve the phase state of the haplotype for the sample ms02g. The suffix PI represents the phased block (i.e., all data within that block is phased).

For sample ms02g, C-T-A-G is phased (PI level 6), and so is T-C-C-T. Similarly, data within PI level 4 is also phased. But since the haplotype is broken into two levels, we don't know which phase from level 6 goes with which phase of level 4. Phase to be solved - sample ms02g:

ms02g_PI  ms02g_PG_al
6         C|T
6         T|C
6         A|C
6         G|T
×——————————×—————> Break Point
4         T|C
4         T|C
4         T|C
4         T|A

Since all other samples are completely phased, I can use markov-chain transition probabilities to solve the phase state. To the human eye/mind it is clear that the left part of ms02g (level 6, i.e., C-T-A-G) is more likely to go with the right block of level 4 (C-C-C-A).

Calculation:

Step 01:

I prepare required haplotype configuration:

  • The top haplotype-PI is block01 and bottom is block-02.
  • The left phased haplotype within each block is Hap-A and the right is Hap-B.

So, there are 2 haplotype configurations:

  • Block01-HapA with Block02-HapA, so B01-HapB with B02-HapB
  • Block01-HapA with Block02-HapB, so B01-HapB with B02-HapA

I count the number of transitions from each nucleotide of PI-6 to each nucleotide of PI-4 for each haplotype configuration.
Possible transitions from PI-6 to PI-4:

  Possible
  transitions    ms02g_PG_al
    │       ┌┬┬┬ C|T
    │       ││││ T|C
    │       ││││ A|C
    │       ││││ G|T
    └─────> ││││  ×—————> Break Point
            │││└ T|C
            ││└─ T|C
            │└── T|C
            └─── T|A
  • and multiply the transition probabilities from the first nucleotide in PI-6 to all nucleotides of PI-4, then similarly multiply the transition probabilities from the 2nd nucleotide of PI-6 to all nucleotides of PI-4.

  • then I sum the transition probabilities from one level to another.


  Possible
  transitions    ms02g_PG_al
    │       ┌┬┬┬ C|T 
    │       ││││ T|C | Block 1
    │       ││││ A|C |
    │       ││││ G|T /
    └─────> ││││  ×—————> Break Point
            │││└ T|C 
            ││└─ T|C |
            │└── T|C | Block 2
            └─── T|A /
                 ↓ ↓
             Hap A Hap B

Calculating likelihood:

  Block-1-Hap-A with Block-2-Hap-B    vs.    Block-1-Hap-A with Block-2-Hap-A

  C|C × C|C × C|C × A|C = 0.58612            T|C × T|C × T|C × T|C = 0.000244
+ C|T × C|T × C|T × A|T = 0.3164           + T|T × T|T × T|T × T|T = 0.00391
+ C|A × C|A × C|A × A|A = 0.482            + T|A × T|A × T|A × T|A = 0.0007716
+ C|G × C|G × C|G × A|G = 0.3164           + T|G × T|G × T|G × T|G = 0.00391
                          ———————                                    —————————
                    Avg = 0.42523                              Avg = 0.002207

        Likelihood Ratio = 0.42523 / 0.002207 = 192.67

I have the following working code in Python 3.
I think the transition probs can mostly be optimized using numpy or a markov-chain package, but I was not able to work that out the way I wanted, and it took me some time to get to what I have.

Any suggestions with explanations are welcome. Thanks.

#!/home/bin/python
from __future__ import division

import collections
from itertools import product
from itertools import islice
import itertools
import csv
from pprint import pprint


''' function to count the number of transitions '''
def sum_Probs(pX_Y, pX):
    try:
        return float(pX_Y / pX)
    except ZeroDivisionError:
        return 0


with open("HaploBlock_Updated.txt", 'w') as f:

    ''' Write header (part 01): This is just the front part of the header. The rear part of the header is
    updated dynamically depending upon the number of samples '''
    f.write('\t'.join(['chr', 'pos', 'ms02g_PI', 'ms02g_PG_al', '\n']))

    with open('HaploBlock_for_test-forHMMtoy02.txt') as Phased_Data:

        ''' Note: Create a dictionary to store required data. The Dict should contain information about
         two adjacent haplotype blocks that needs extending. In this example I want to extend the
         haplotypes for sample ms02g which has two blocks 6 and 4. So, I read the PI and PG value
         for this sample. Also, data should store with some unique keys. Some keys are: chr, pos,
         sample (PI, PG within sample), possible haplotypes ... etc .. '''


        Phased_Dict = csv.DictReader(Phased_Data, delimiter='\t')
        grouped = itertools.groupby(Phased_Dict, key=lambda x: x['ms02g_PI'])


        ''' Function to read the data as blocks (based on PI values of ms02g) at a time'''
        def accumulate(data):
            acc = collections.OrderedDict()
            for d in data:
                for k, v in d.items():
                    acc.setdefault(k, []).append(v)
            return acc


        ''' Store data as keys,values '''
        grouped_data = collections.OrderedDict()
        for k, g in grouped:
            grouped_data[k] = accumulate(g)


        ''' Clear memory '''
        del Phased_Dict
        del grouped

        ''' Iterate over two Hap Blocks at once. This is done to obtain all possible Haplotype
        configurations between two blocks. The (keys,values) for first block is represented as
        k1,v2 and for the later block as k2,v2. '''

        # Empty list for storing haplotypes from each block
        hap_Block1_A = [];
        hap_Block1_B = []
        hap_Block2_A = [];
        hap_Block2_B = []

        # Create empty list for possible haplotype configurations from above block
        hapB1A_hapB2A = []
        hapB1B_hapB2B = []

        ''' list of all available samples (index, value) '''
        sample_list = [('ms01e_PI', 'ms01e_PG_al'), ('ms02g_PI', 'ms02g_PG_al'),
                       ('ms03g_PI', 'ms03g_PG_al'), ('ms04h_PI', 'ms04h_PG_al'),
                       ('ms05h_PI', 'ms05h_PG_al'), ('ms06h_PI', 'ms06h_PG_al')]


        ''' read data from two consecutive blocks at a time '''
        for (k1, v1), (k2, v2) in zip(grouped_data.items(), islice(grouped_data.items(), 1, None)):
            ''' skip if one of the keys has no values'''
            if k1 == '.' or k2 == '.':
                continue


            ''' iterate over the first Haplotype Block, i.e the k1 block.
            The nucleotides in the left of the phased SNPs are called Block01-haplotype-A,
            and similarly on the right as Block01-haplotype-B. '''
            hap_Block1_A = [x.split('|')[0] for x in v1['ms02g_PG_al']] # the left haplotype of Block01
            hap_Block1_B = [x.split('|')[1] for x in v1['ms02g_PG_al']]


            ''' iterate over the second Haplotype Block,
            i.e the k2 block '''
            hap_Block2_A = [x.split('|')[0] for x in v2['ms02g_PG_al']]
            hap_Block2_B = [x.split('|')[1] for x in v2['ms02g_PG_al']]



            ''' Now, we have to start to solve the possible haplotype configuration.
            Possible haplotype Configurations will be, Either :
            1) Block01-haplotype-A phased with Block02-haplotype-A,
                creating -> hapB1A-hapB2A, hapB1B-hapB2B
            Or,
            2) Block01-haplotype-A phased with Block02-haplotype-B
                creating -> hapB1A-hapB2B, hapB1B-hapB2A '''

            ''' First possible configuration '''
            hapB1A_hapB2A = [hap_Block1_A, hap_Block2_A]
            hapB1B_hapB2B = [hap_Block1_B, hap_Block2_B]

            ''' Second Possible Configuration '''
            hapB1A_hapB2B = [hap_Block1_A, hap_Block2_B]
            hapB1B_hapB2A = [hap_Block1_B, hap_Block2_A]


            print('\nStarting MarkovChains')
            ''' Now, start preping the first order markov transition matrix
             for the observed haplotypes between two blocks.'''


            ''' To store the sum-values of the product of the transition probabilities.
            These sum are added as the product-of-transition is retured by nested for-loop;
            from "for m in range(....)" '''

            Sum_Of_Product_of_Transition_probabilities_hapB1A_hapB2A = \
                sumOf_PT_hapB1A_B2A = 0
            Sum_Of_Product_of_Transition_probabilities_hapB1B_hapB2B = \
                sumOf_PT_hapB1B_B2B = 0

            Sum_Of_Product_of_Transition_probabilities_hapB1A_hapB2B = \
                sumOf_PT_hapB1A_B2B = 0
            Sum_Of_Product_of_Transition_probabilities_hapB1B_hapB2A = \
                sumOf_PT_hapB1B_B2A = 0


            for n in range(len(v1['ms02g_PI'])):  # n-ranges from block01

                ''' Set the probabilities of each nucleotides at Zero. They are updated for each level
                 of "n" after reading the number of each nucleotides at that position. '''
                pA = 0; pT = 0; pG = 0; pC = 0
                # nucleotide_prob = [0., 0., 0., 0.] # or store as numpy matrix

                # Also storing these values as Dict
                # probably can be improved
                nucleotide_prob_dict = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
                print('prob as Dict: ', nucleotide_prob_dict)


                ''' for storing the product-values of the transition probabilities.
                These are updated for each level of "n" paired with each level of "m". '''
                product_of_transition_Probs_hapB1AB2A = POTP_hapB1AB2A = 1
                product_of_transition_Probs_hapB1BB2B = POTP_hapB1BB2B = 1

                product_of_transition_Probs_hapB1AB2B = POTP_hapB1AB2B = 1
                product_of_transition_Probs_hapB1BB2A = POTP_hapB1BB2A = 1



                ''' Now, we read each level of "m" to compute the transition from each level of "n"
                to each level of "m". '''
                for m in range(len(v2['ms02g_PI'])): # m-ranges from block02
                    # transition for each level of 0-to-n level of V1 to 0-to-m level of V2
                    ''' set transition probabilities at Zero for each possible transition '''
                    pA_A = 0; pA_T = 0; pA_G = 0; pA_C = 0
                    pT_A = 0; pT_T = 0; pT_G = 0; pT_C = 0
                    pG_A = 0; pG_T = 0; pG_G = 0; pG_C = 0
                    pC_A = 0; pC_T = 0; pC_G = 0; pC_C = 0

                    ''' Also, creating an empty dictionary to store transition probabilities
                    - probably upgrade using numpy '''
                    transition_prob_dict = {'A_A' : 0, 'A_T' : 0, 'A_G' : 0, 'A_C' : 0,
                    'T_A' : 0, 'T_T' : 0, 'T_G' : 0, 'T_C' : 0,
                    'G_A' : 0, 'G_T' : 0, 'G_G' : 0, 'G_C' : 0,
                    'C_A' : 0, 'C_T' : 0, 'C_G' : 0, 'C_C' : 0}


                    ''' Now, loop through each sample to compute initial probs and transition probs '''
                    for x, y in sample_list:
                        print('sample x and y:', x,y)


                        ''' Update nucleotide probability for this site
                        - only calculated from v1 and only once for each parse/iteration '''
                        if m == 0:
                            pA += (v1[y][n].count('A'))
                            pT += (v1[y][n].count('T'))
                            pG += (v1[y][n].count('G'))
                            pC += (v1[y][n].count('C'))

                            nucleotide_prob_dict['A'] = pA
                            nucleotide_prob_dict['T'] = pT
                            nucleotide_prob_dict['G'] = pG
                            nucleotide_prob_dict['C'] = pC

                        nucleotide_prob = [pA, pT, pG, pC]


                        ''' Now, update transition matrix '''
                        nucl_B1 = (v1[y][n]).split('|')  # nucleotides at Block01
                        nucl_B2 = (v2[y][m]).split('|')  # nucleotides at Block02


                        ''' create possible haplotype configurations between "n" and "m".
                         If the index (PI value) are same we create zip, if index (PI value) are
                         different we create product. '''

                        HapConfig = []  # create empty list

                        if v1[x][n] == v2[x][m]:
                            ziped_nuclB1B2 = [(x + '_' + y) for (x,y) in zip(nucl_B1, nucl_B2)]
                            HapConfig = ziped_nuclB1B2

                            ''' Move this counting function else where '''
                            pA_A += (HapConfig.count('A_A'))  # (v1[y][0].count('A'))/8
                            pA_T += (HapConfig.count('A_T'))
                            pA_G += (HapConfig.count('A_G'))
                            pA_C += (HapConfig.count('A_C'))

                            pT_A += (HapConfig.count('T_A'))  # (v1[y][0].count('A'))/8
                            pT_T += (HapConfig.count('T_T'))
                            pT_G += (HapConfig.count('T_G'))
                            pT_C += (HapConfig.count('T_C'))

                            pG_A += (HapConfig.count('G_A'))  # (v1[y][0].count('A'))/8
                            pG_T += (HapConfig.count('G_T'))
                            pG_G += (HapConfig.count('G_G'))
                            pG_C += (HapConfig.count('G_C'))

                            pC_A += (HapConfig.count('C_A'))
                            pC_T += (HapConfig.count('C_T'))
                            pC_G += (HapConfig.count('C_G'))
                            pC_C += (HapConfig.count('C_C'))


                        if v1[x][n] != v2[x][m]:
                            prod_nuclB1B2 = [(x + '_' + y) for (x,y) in product(nucl_B1, nucl_B2)]
                            HapConfig = prod_nuclB1B2
                            print('prod NuclB1B2: ', prod_nuclB1B2)

                            pA_A += (HapConfig.count('A_A'))/2
                            pA_T += (HapConfig.count('A_T'))/2
                            pA_G += (HapConfig.count('A_G'))/2
                            pA_C += (HapConfig.count('A_C'))/2

                            pT_A += (HapConfig.count('T_A'))/2
                            pT_T += (HapConfig.count('T_T'))/2
                            pT_G += (HapConfig.count('T_G'))/2
                            pT_C += (HapConfig.count('T_C'))/2

                            pG_A += (HapConfig.count('G_A'))/2
                            pG_T += (HapConfig.count('G_T'))/2
                            pG_G += (HapConfig.count('G_G'))/2
                            pG_C += (HapConfig.count('G_C'))/2

                            pC_A += (HapConfig.count('C_A'))/2
                            pC_T += (HapConfig.count('C_T'))/2
                            pC_G += (HapConfig.count('C_G'))/2
                            pC_C += (HapConfig.count('C_C'))/2


                    ''' Now, compute nucleotide and transition probabilities for each nucleotide
                    from each 0-n to 0-m at each sample. This updates the transition matrix in
                    each loop. **Note: At the end this transition probabilities should sum to 1 '''

                    ''' Storing nucleotide probabilities '''
                    nucleotide_prob = [pA, pT, pG, pC]

                    ''' Storing transition probability as dict'''
                    transition_prob_dict['A_A'] = sum_Probs(pA_A,pA)
                    transition_prob_dict['A_T'] = sum_Probs(pA_T,pA)
                    transition_prob_dict['A_G'] = sum_Probs(pA_G,pA)
                    transition_prob_dict['A_C'] = sum_Probs(pA_C,pA)

                    transition_prob_dict['T_A'] = sum_Probs(pT_A,pT)
                    transition_prob_dict['T_T'] = sum_Probs(pT_T,pT)
                    transition_prob_dict['T_G'] = sum_Probs(pT_G,pT)
                    transition_prob_dict['T_C'] = sum_Probs(pT_C,pT)

                    transition_prob_dict['G_A'] = sum_Probs(pG_A,pG)
                    transition_prob_dict['G_T'] = sum_Probs(pG_T,pG)
                    transition_prob_dict['G_G'] = sum_Probs(pG_G,pG)
                    transition_prob_dict['G_C'] = sum_Probs(pG_C,pG)

                    transition_prob_dict['C_A'] = sum_Probs(pC_A,pC)
                    transition_prob_dict['C_T'] = sum_Probs(pC_T,pC)
                    transition_prob_dict['C_G'] = sum_Probs(pC_G,pC)
                    transition_prob_dict['C_C'] = sum_Probs(pC_C,pC)


                    '''Now, we start solving the haplotype configurations after we have the
                      required data (nucleotide and transition probabilities).
                    Calculate joint probability for each haplotype configuration.
                    Step 01: We calculate the product of the transition from one level
                    of "n" to several levels of "m". '''

                    ''' finding possible configuration between "n" and "m". '''
                    hapB1A_hapB2A_transition = hapB1A_hapB2A[0][n] + '_' + hapB1A_hapB2A[1][m]
                    hapB1B_hapB2B_transition = hapB1B_hapB2B[0][n] + '_' + hapB1B_hapB2B[1][m]

                    hapB1A_hapB2B_transition = hapB1A_hapB2B[0][n] + '_' + hapB1A_hapB2B[1][m]
                    hapB1B_hapB2A_transition = hapB1B_hapB2A[0][n] + '_' + hapB1B_hapB2A[1][m]


                    ''' computing the products of transition probabilities on the for loop '''
                    POTP_hapB1AB2A *= transition_prob_dict[hapB1A_hapB2A_transition]
                    POTP_hapB1BB2B *= transition_prob_dict[hapB1B_hapB2B_transition]

                    POTP_hapB1AB2B *= transition_prob_dict[hapB1A_hapB2B_transition]
                    POTP_hapB1BB2A *= transition_prob_dict[hapB1B_hapB2A_transition]


                ''' Step 02: sum of the product of the transition probabilities '''
                sumOf_PT_hapB1A_B2A += POTP_hapB1AB2A
                sumOf_PT_hapB1B_B2B += POTP_hapB1BB2B

                sumOf_PT_hapB1A_B2B += POTP_hapB1AB2B
                sumOf_PT_hapB1B_B2A += POTP_hapB1BB2A


            ''' Step 03: Now, computing the likely hood of each haplotype configuration '''
            print('computing likelyhood:')

            likelyhood_hapB1A_with_B2A_Vs_B2B = LH_hapB1AwB2AvsB2B = \
                sumOf_PT_hapB1A_B2A / sumOf_PT_hapB1A_B2B
            likelyhood_hapB1B_with_B2B_Vs_B2A = LH_hapB1BwB2BvsB2A = \
                sumOf_PT_hapB1B_B2B / sumOf_PT_hapB1B_B2A

            likelyhood_hapB1A_with_B2B_Vs_B2A = LH_hapB1AwB2BvsB2A = \
                sumOf_PT_hapB1A_B2B / sumOf_PT_hapB1A_B2A
            likelyhood_hapB1B_with_B2A_Vs_B2B = LH_hapB1BwB2AvsB2B = \
                sumOf_PT_hapB1B_B2A / sumOf_PT_hapB1B_B2B


            print('\nlikely hood Values: B1A with B2A vs. B2B, B1B with B2B vs. B2A')
            print(LH_hapB1AwB2AvsB2B, LH_hapB1BwB2BvsB2A)

            print('\nlikely hood Values: B1A with B2B vs. B2A, B1B with B2A vs. B2B')
            print(LH_hapB1AwB2BvsB2A, LH_hapB1BwB2AvsB2B)


            ''' Update the phase state of the block based on evidence '''
            if (LH_hapB1AwB2AvsB2B + LH_hapB1BwB2BvsB2A) > \
                4*(LH_hapB1AwB2BvsB2A + LH_hapB1BwB2AvsB2B):
                print('Block-1_A is phased with Block-2_A')

                for x in range(len(v1['ms02g_PI'])):
                    PI_value = v1['ms02g_PI'][x]
                    # Note: so, we transfer the PI value from the earlier block to the next block

                    f.write('\t'.join([v1['chr'][x], v1['pos'][x], v1['ms02g_PI'][x],
                                       hapB1A_hapB2A[0][x] + '|' + hapB1B_hapB2B[0][x], '\n']))

                for x in range(len(v2['ms02g_PI'])):
                    f.write('\t'.join([v2['chr'][x], v2['pos'][x], PI_value,
                                       hapB1A_hapB2A[1][x] + '|' + hapB1B_hapB2B[1][x], '\n']))

            elif (LH_hapB1AwB2AvsB2B + LH_hapB1BwB2BvsB2A) < \
                4 * (LH_hapB1AwB2BvsB2A + LH_hapB1BwB2AvsB2B):
                print('Block-1_A is phased with Block-2_B')

                for x in range(len(v1['ms02g_PI'])):
                    PI_value = v1['ms02g_PI'][x]
                    # Note: so, we transfer the PI value from the earlier block to the next block...
                        # but the phase positions are reversed

                    f.write('\t'.join([v1['chr'][x], v1['pos'][x], v1['ms02g_PI'][x],
                                       hapB1A_hapB2A[0][x] + '|' + hapB1B_hapB2B[0][x], '\n']))

                for x in range(len(v2['ms02g_PI'])):
                    f.write('\t'.join([v2['chr'][x], v2['pos'][x], PI_value,
                                       hapB1B_hapB2B[1][x] + '|' + hapB1A_hapB2A[1][x], '\n']))


            else:
                print('cannot solve the phase state with confidence - sorry ')
                for x in range(len(v1['ms02g_PI'])):
                    f.write('\t'.join([v1['chr'][x], v1['pos'][x], v1['ms02g_PI'][x],
                                   hapB1A_hapB2A[0][x] + '|' + hapB1B_hapB2B[0][x], '\n']))

                for x in range(len(v2['ms02g_PI'])):
                    f.write('\t'.join([v2['chr'][x], v2['pos'][x], v2['ms02g_PI'][x],
                                       hapB1A_hapB2A[1][x] + '|' + hapB1B_hapB2B[1][x], '\n']))
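
As a direction for the numpy optimization asked about above, here is a hedged sketch (not a drop-in replacement, and it only covers the same-PI counting step): the per-site transition counts can be built as a 4x4 matrix indexed by nucleotide, which replaces the sixteen pA_A ... pC_C counters and the repeated .count() calls.

import numpy as np

NUC = {'A': 0, 'T': 1, 'G': 2, 'C': 3}

def transition_counts(site_n, site_m):
    """Count transitions between two phased sites.

    site_n and site_m are lists of strings like 'C|T' (one entry per sample).
    Returns a 4x4 matrix C where C[i, j] is the number of observed transitions
    from nucleotide i at site n to nucleotide j at site m.
    """
    counts = np.zeros((4, 4))
    for gt_n, gt_m in zip(site_n, site_m):
        left = gt_n.split('|')
        right = gt_m.split('|')
        # zip corresponds to the "same PI" case in the original code;
        # the product-with-1/2-weight case for differing PI values would go here too
        for a, b in zip(left, right):
            counts[NUC[a], NUC[b]] += 1
    return counts

# transition probabilities: divide each row by the nucleotide count at site n
counts = transition_counts(['C|T', 'T|C'], ['T|C', 'T|C'])
row_sums = counts.sum(axis=1, keepdims=True)
probs = np.divide(counts, row_sums, out=np.zeros_like(counts), where=row_sums > 0)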



#StackBounty: #python #numpy #tensorflow #keras #keras-layer InvalidArgumentError: In[0].dim(0) and In[1].dim(0) must be the same: [1,1…

Bounty: 50

I’m trying to create a custom layer merging 2 sources. I am receiving the error “InvalidArgumentError: In[0].dim(0) and In[1].dim(0) must be the same: [1,125,150] vs [32,150,125].” The code runs if I set the batch_size to 1, so the shapes become [1,125,150] vs [1,150,125]; however, the loss then doesn’t change, so that is still not the root cause. I think I need to use the batch size instead of just expand_dims.

class mergeLayer(L.Layer):
    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(mergeLayer,self).__init__()
        self.kernel_initializer = INIT.get('uniform')

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.kernel = self.add_weight(name='kernel',shape=input_shape[1:],initializer=self.kernel_initializer,trainable=True)
        super(mergeLayer,self).build(input_shape) # Be sure to call this somewhere!

    def call(self, x):
        temp = K.batch_dot(tf.expand_dims(self.kernel,0),tf.transpose(x,perm=[0,2,1]))+1
        return temp
    def compute_output_shape(self, input_shape):
        return input_shape

Below is the code fitting the model. Again, if I change batch_size to 1 here, I can get the code to run, but the loss stays the same.

modelMerge.fit(x=[train1,train2],y=cats,epochs=100,batch_size=32,shuffle='batch')
score = modelMerge.evaluate(x=[test1,test2],y=cats,batch_size=32)

Output when batch_size is 1

Epoch 1/100
3903/3903 [=========================] - 45s - loss: 15.7062 - acc: 0.0254
Epoch 2/100
3903/3903 [=========================] - 43s - loss: 15.7050 - acc: 0.0254
Epoch 3/100
277/3903 [=>.......................] - ETA: 42s - loss: 15.8272 - acc: 0.0181

Thanks very much for your time and help.
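
In case it helps, a hedged sketch of one way to get matching leading dimensions (using the same tf and K aliases as the snippet above): tile the kernel along the batch axis inside call() instead of only expanding dims, so both batch_dot operands have the batch size as dim 0. Whether this also explains the flat loss is a separate question.

    def call(self, x):
        # tile the (1, n, m) kernel to (batch, n, m) so it matches x's batch dimension
        batch = tf.shape(x)[0]
        kernel = tf.tile(tf.expand_dims(self.kernel, 0), [batch, 1, 1])
        return K.batch_dot(kernel, tf.transpose(x, perm=[0, 2, 1])) + 1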



Distributed Evolutionary Algorithms in Python

DEAP

DEAP is a novel evolutionary computation framework for rapid prototyping and testing of ideas. It seeks to make algorithms explicit and data structures transparent. It works in perfect harmony with parallelization mechanisms such as multiprocessing and SCOOP.

DEAP includes the following features:

  • Genetic algorithm using any imaginable representation
    • List, Array, Set, Dictionary, Tree, Numpy Array, etc.
  • Genetic programming using prefix trees
    • Loosely typed, Strongly typed
    • Automatically defined functions
  • Evolution strategies (including CMA-ES)
  • Multi-objective optimisation (NSGA-II, SPEA2, MO-CMA-ES)
  • Co-evolution (cooperative and competitive) of multiple populations
  • Parallelization of the evaluations (and more)
  • Hall of Fame of the best individuals that lived in the population
  • Checkpoints that take snapshots of a system regularly
  • Benchmarks module containing most common test functions
  • Genealogy of an evolution (that is compatible with NetworkX)
  • Examples of alternative algorithms : Particle Swarm Optimization, Differential Evolution, Estimation of Distribution Algorithm

See the DEAP User’s Guide for DEAP documentation.
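
For a flavor of the API, here is a minimal OneMax-style sketch loosely adapted from the introductory example in the DEAP documentation (the parameter values are arbitrary):

import random
from deap import algorithms, base, creator, tools

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, 100)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", lambda ind: (sum(ind),))   # fitness must be a tuple
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

pop = toolbox.population(n=50)
pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)
print(max(sum(ind) for ind in pop))   # best number of ones found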

Installation

We encourage you to use easy_install or pip to install DEAP on your system. Other installation procedures like apt-get, yum, etc. usually provide an outdated version.

pip install deap

The latest version can be installed with

pip install git+https://github.com/DEAP/deap@master

If you wish to build from sources, download or clone the repository and type

python setup.py install

 

Source

#StackBounty: #python #numpy #matplotlib #scipy #statistics scipy.stats.binned_statistic_2d works for count but not mean

Bounty: 50

I have some satellite data which looks like the following (scatter plot):

Night-time Ion Density

I now want to bin this data into a regular grid over time and latitude and have each bin be equal to the mean of all the data points that fall within it. I have been experimenting with scipy.stats.binned_statistic_2d and am baffled by the results I am getting.

First, if I pass the “count” statistic into the scipy binning function, it appears to work correctly (minimal code and plot below).

id1 = np.ma.masked_where(id1==0, id1) #id1 is the actual data and I have tried using this masking argument and without to the same effect

x_range = np.arange(0,24.25,.25) #setting grid spacing for x and y
y_range = np.arange(-13,14,1)

xbins, ybins = len(x_range), len(y_range) #number of bins in each dimension

H, xedges, yedges, binnumber = stats.binned_statistic_2d(idtime, idlat, values = id1, statistic='count' , bins = [xbins, ybins])  #idtime and idlat are the locations of each id1 value in time and latitude
H = np.ma.masked_where(H==0, H) #masking where there was no data
XX, YY = np.meshgrid(xedges, yedges)

fig = plt.figure(figsize = (13,7))
ax1=plt.subplot(111)
plot1 = ax1.pcolormesh(XX,YY,H.T)

Resulting Plot

Counts

Now if I change the statistic to mean, np.mean, np.ma.mean, etc., this is the plot I get, which appears only to pick out where there is data and where there is none:

Mean

This is even though the min and max values for this data are 612 and 2237026 respectively. I have written some code that does this binning manually, but it isn’t pretty and takes forever (and I haven’t completely accounted for edge effects, so running into errors and then fixing them is taking forever).

I would love some advice to get this to work. Thanks!

Edit: I just noticed that I am getting a runtime warning after running the script which I can’t find any information about online. A google search for the warning returns zero results. The warning occurs for every statistic option except for count.

AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\matplotlib\colors.py:494:
RuntimeWarning: invalid value encountered in less
  cbook._putmask(xa, xa < 0.0, -1)

Edit 2: I am attaching some code below that duplicates my problem. It works for the count statistic but not for mean or any other statistic, and it produces the same runtime warning as before.

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

x = np.random.rand(1000)
y = np.random.rand(1000)

z = np.arange(1000)

H, xedges, yedges, binnumber = stats.binned_statistic_2d(x, y, values = z, statistic='count' , bins = [20, 20])
H2, xedges2, yedges2, binnumber2 = stats.binned_statistic_2d(x, y, values = z, statistic='mean' , bins = [20, 20])

XX, YY = np.meshgrid(xedges, yedges)
XX2, YY2 = np.meshgrid(xedges2, yedges2)

fig = plt.figure(figsize = (13,7))
ax1=plt.subplot(111)
plot1 = ax1.pcolormesh(XX,YY,H.T)
cbar = plt.colorbar(plot1,ax=ax1, pad = .015, aspect=10)
plt.show()

fig2 = plt.figure(figsize = (13,7))
ax2=plt.subplot(111)
plot2 = ax2.pcolormesh(XX2,YY2,H2.T)
cbar = plt.colorbar(plot2,ax=ax2, pad = .015, aspect=10)
plt.show()

count_working_code
mean_working_code

Edit 3: User8153 was able to identify the problem. The solution was to mask the array from scipy stats where nans occur. I used np.ma.masked_invalid() to do this. Plots of my original data and test data are below for the mean statistic.

Working Mean My Data
Working Mean Sample Data
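
Applied to the sample code from Edit 2, the fix is only a couple of lines (a sketch following user8153's suggestion, with variable names matching the snippet above): mask the NaNs that binned_statistic_2d leaves in empty bins before handing the array to pcolormesh.

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

x, y = np.random.rand(1000), np.random.rand(1000)
z = np.arange(1000)

H2, xedges2, yedges2, _ = stats.binned_statistic_2d(x, y, values=z,
                                                    statistic='mean', bins=[20, 20])
H2 = np.ma.masked_invalid(H2)          # empty bins are NaN for 'mean'; mask them out
XX2, YY2 = np.meshgrid(xedges2, yedges2)

fig, ax = plt.subplots(figsize=(13, 7))
mesh = ax.pcolormesh(XX2, YY2, H2.T)
plt.colorbar(mesh, ax=ax, pad=.015, aspect=10)
plt.show()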



#StackBounty: #python #numpy #matplotlib #scikit-learn Plot specific points in DBSCAN in sklearn in python

Bounty: 50

I have a set of documents and I create a feature matrix from them. Then I calculate the cosine distances between the documents, and I input that cosine distance matrix to the DBSCAN algorithm. My code is as follows.

import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import DBSCAN

# Initialize some documents
doc1 = {'Science':0.8, 'History':0.05, 'Politics':0.15, 'Sports':0.1}
doc2 = {'News':0.2, 'Art':0.8, 'Politics':0.1, 'Sports':0.1}
doc3 = {'Science':0.8, 'History':0.1, 'Politics':0.05, 'News':0.1}
doc4 = {'Science':0.1, 'Weather':0.2, 'Art':0.7, 'Sports':0.1}
doc5 = {'Science':0.2, 'Weather':0.7, 'Art':0.8, 'Sports':0.9}
doc6 = {'Science':0.2, 'Weather':0.8, 'Art':0.8, 'Sports':1.0}
collection = [doc1, doc2, doc3, doc4, doc5, doc6]
df = pd.DataFrame(collection)
# Fill missing values with zeros
df.fillna(0, inplace=True)
# Get Feature Vectors
feature_matrix = df.as_matrix()
print(feature_matrix.tolist())

# Get cosine distance between pairs
sims = pairwise_distances(feature_matrix, metric='cosine')

# Fit DBSCAN
db = DBSCAN(min_samples=1, metric='precomputed').fit(sims)

Now, as shown in the DBSCAN demo of sklearn, I plot the clusters. That is, instead of X I insert sims, which is my cosine distance matrix.

labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
#print(labels)

# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = sims[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    xy = sims[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

  1. My first question is: is it correct to pass sims instead of X, given that X represents coordinate values in the sklearn demo whereas sims contains cosine distance values?
  2. My second question is: is it possible to color given points red? For example, I want to change the point that represents [0.8, 0.0, 0.0, 0.0, 0.2, 0.9, 0.7] of the feature_matrix to red (see the sketch below).
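
For the second question, a hedged sketch (assuming feature_matrix columns are in the same order as printed above, and reusing the sims coordinates already being plotted): find the row index of the document whose feature vector matches, then overplot that point in red before plt.show().

target = np.array([0.8, 0.0, 0.0, 0.0, 0.2, 0.9, 0.7])
idx = np.where(np.isclose(feature_matrix, target).all(axis=1))[0]  # matching row(s)
plt.plot(sims[idx, 0], sims[idx, 1], 'o', markerfacecolor='r',
         markeredgecolor='k', markersize=14)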



#StackBounty: #python #numpy #matplotlib #fractals Plotting the Mandelbrot set at different zoom levels

Bounty: 50

I’m interested in making an animated movie of a zoom in on a part of the Mandelbrot set. My code works well for a few zooms, but upon trying to zoom in quite far, I find that the fractal becomes “smoothed out”. Am I missing something that prevents me from seeing the fractal structure at higher zoom levels? Am I hitting machine precision in the computations? Here’s what I’m talking about:

Zoom level 3:
Zoom level 9:
Zoom level 20:

The first plot looks good, the second is okay, and the third is not fractal at all.

If there are any other deficiencies or improvements I’d be glad to hear about them as well.

Here’s my code:

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib import animation
import time

# some interesting places in the set
# http://www.nahee.com/Derbyshire/manguide.html

N = 500
nIts = 25
nZooms = 50
x0=0
y0=-1

movie = np.zeros([N,N,nZooms])

x = np.linspace(-2,1,N)
y = np.linspace(-1,1,N)
X,Y = np.meshgrid(x,y)
c = X + 1j*Y
z = 0*c
for i in range(nIts):
    z = z**2 + c

mask = np.abs(z) < 1
z[z>1]=0
z[np.isnan(z)]=0
movie[:,:,0] = mask


# plotting stuff
for j in range(1,nZooms):
    h=1./(2**j)
    print "Plot number ", j
    x = x0+h*np.linspace(-1,1,N)
    y = y0+h*np.linspace(-1,1,N)
    X,Y = np.meshgrid(x,y)
    c = X + 1j*Y
    z = 0*c
    for i in range(nIts):
        z = z**2 + c

    mask = np.abs(z) < 1
    z[z>1]=0
    z[np.isnan(z)]=0
    movie[:,:,j] = mask

fig = plt.figure()

for j in range(nZooms):
    name = "image%d.png" % j
    plt.imshow(movie[:,:,j], cmap = 'RdBu')
    plt.gray()
    plt.axis('equal')
    plt.axis('off')
    # plt.show()
    fig.savefig(name)
    time.sleep(1)
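
Not a full answer, but a hedged sketch of one likely culprit: the iteration count stays at 25 while the window shrinks by a factor of 2**j, so deep frames simply haven't iterated enough to separate points near the boundary; double precision itself only becomes a problem at far deeper zooms than level 20. Scaling nIts with the zoom level (and using the conventional |z| > 2 escape test) keeps the structure visible. The variables below reuse those from the script above.

for j in range(1, nZooms):
    h = 1.0 / (2 ** j)
    nIts_j = 25 + 10 * j                     # heuristic: more iterations as we zoom in
    x = x0 + h * np.linspace(-1, 1, N)
    y = y0 + h * np.linspace(-1, 1, N)
    X, Y = np.meshgrid(x, y)
    c = X + 1j * Y
    z = np.zeros_like(c)
    escaped = np.zeros(c.shape, dtype=bool)
    for _ in range(nIts_j):
        z[~escaped] = z[~escaped] ** 2 + c[~escaped]
        escaped |= np.abs(z) > 2             # standard escape-radius test
    movie[:, :, j] = ~escaped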



#StackBounty: Efficient extraction of patch features over an image

Bounty: 50

Input:

  • data – a matrix of size n*m*3*3 (complex values)
  • indices – a list of coordinates (x,y), where x < n and y < m
  • fp – a feature parameter which is a tuple of ((fp11, fp12), (fp21, fp22), id)
  • reference – a list of 3*3 matrices
  • srwd – a function which computes a similarity value between two complex-valued 3*3 matrices

Output:

  • feature_values – a list of features – one feature for each index in (indices)

Functionality:

Given an image (data) where each pixel is a 3*3 matrix, and a list of target pixels (indices), I want to extract features of the patch surrounding each target pixel.

A patch feature is either:
a) the srwd of a pixel in the patch with a reference matrix, or
b) the srwd of two pixels in the patch.

Thus a feature can be described by the relative coordinates fp11, fp12 (x and y offset of pixel of interest 1) and fp21, fp22 (x and y offset of pixel of interest 2).
If fp11 == fp21 and fp12 == fp22, then I want to compute a); otherwise I want to compute b).
The reference matrix of interest is defined by the feature parameter called id.

Note that the indices of interest are already filtered so that the sum x+fp__ < n and y+fp__< m for all possible fp__.

Code

Computing the symmetric revised Wishart distance, with regularization in case matrix A or B is not invertible:

def srwd(A, B):
    """This function computes the symetric revised wishart distance as from the paper
    SPECTRAL CLUSTERING OF POLARIMETRIC SAR DATA WITH WISHART-DERIVED DISTANCE MEASURES"""
    try:
        dist = 0.5 * np.trace(np.dot(A, inv(B)) + np.dot(B, inv(A))) - len(A)      
    except:
        A, B = A.reshape(3, 3) + np.eye(3) * 0.000001, B.reshape(3, 3) + np.eye(3) * 0.000001
        dist = 0.5 * np.trace(np.dot(A, inv(B)) + np.dot(B, inv(A))) - len(A)      
    return abs(dist)

Getting the features with the input as given above:

def feature(data, indices, fp, reference):
    # fp is a tuple of 2 coordinates in a patch ((x1,x2),(y1,y2),ref),
    # where ref is an index of a random reference matrix in reference only relevant in case x1=y1 and x2=y2
    res = []
    if fp[0] != fp[1]:
        for i in indices:
            x, y = i
            res.append(srwd(data[x + fp[0][0]][y + fp[0][1]], data[x + fp[1][0]][y + fp[1][1]]))
    else:
        for i in indices:
            x, y = i
            res.append(srwd(data[x + fp[0][0]][y + fp[0][1]], reference[fp[2]]))
    return res

Finally there is another loop such as:

for fp in feature_params:
    feature_values = feature(data, indices, fp, reference)
    #here work on feature_values

The current implementation is rather inefficient and a bottleneck of the whole process. How could I improve it?

Is there a way to efficiently compute a feature matrix and operate on it afterwards?

An executable toy example including the whole code is given here (allowing copy-paste):

import numpy as np
from numpy.linalg import inv

#toy example
data = np.random.rand(1000, 1000, 3, 3) #an image of 1000*1000 pixels, each pixel a 3*3 matrix
indices = np.random.randint(3,96, size = (10000,2)) # a list of 10000 target pixels (lets assume they are unique)
reference = [np.random.rand(3,3)] # a single reference matrix in a list (in actual application there are multiple reference matrices)
feature_params = [((0,0),(-1,-1), 0), ((0,0), (0,0), 0), ((0,1), (0,0), 0), ((1,0), (0,0), 0), ((1,1), (0,0), 0)] 



def srwd(A, B):
    """This function computes the symetric revised wishart distance as from the paper
    SPECTRAL CLUSTERING OF POLARIMETRIC SAR DATA WITH WISHART-DERIVED DISTANCE MEASURES"""
    try:
        dist = 0.5 * np.trace(np.dot(A, inv(B)) + np.dot(B, inv(A))) - len(A)      
    except:
        A, B = A.reshape(3, 3) + np.eye(3) * 0.000001, B.reshape(3, 3) + np.eye(3) * 0.000001
        dist = 0.5 * np.trace(np.dot(A, inv(B)) + np.dot(B, inv(A))) - len(A)      
    return abs(dist)


def feature(data, indices, fp, reference):
    # fp is a tuple of 2 coordinates in a patch ((x1,x2),(y1,y2),ref),
    # where ref is an index of a random reference matrix in reference only relevant in case x1=y1 and x2=y2
    res = []
    if fp[0] != fp[1]:
        for i in indices:
            x, y = i
            res.append(srwd(data[x + fp[0][0]][y + fp[0][1]], data[x + fp[1][0]][y + fp[1][1]]))
    else:
        for i in indices:
            x, y = i
            res.append(srwd(data[x + fp[0][0]][y + fp[0][1]], reference[fp[2]]))
    return res

for fp in feature_params:
    feature_values = feature(data, indices, fp, reference)        
    #here work on feature_values

A final note about the dimensions of the actual problem:

  • Image of size 6000*1700,
  • around 500 features in feature_params
  • indices is a list of around 8,000,000 target indices
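
One possible direction, as a hedged sketch against the toy example above (it assumes data is a NumPy array and only shows the pixel-pair case; the reference case is the same with right replaced by a broadcast reference matrix): batch the 3*3 inversions with np.linalg.inv and compute the traces with einsum, so srwd is evaluated once per feature parameter over all indices instead of once per index.

def srwd_batch(A, B, eps=1e-6):
    """Vectorized symmetric revised Wishart distance for stacks of 3x3 matrices.

    A and B have shape (k, 3, 3); returns an array of k distances.
    Regularization is applied up front instead of inside a try/except.
    """
    A = A + eps * np.eye(3)
    B = B + eps * np.eye(3)
    invA = np.linalg.inv(A)                        # batched inverses, shape (k, 3, 3)
    invB = np.linalg.inv(B)
    # trace(A @ inv(B)) + trace(B @ inv(A)) for every pair, via einsum
    tr = np.einsum('kij,kji->k', A, invB) + np.einsum('kij,kji->k', B, invA)
    return np.abs(0.5 * tr - 3)

# usage with the toy data above: gather all left/right pixels for one feature
# parameter at once, then call srwd_batch a single time instead of looping
idx = np.asarray(indices)
fp = feature_params[0]                             # a pixel-pair feature (fp[0] != fp[1])
left = data[idx[:, 0] + fp[0][0], idx[:, 1] + fp[0][1]]    # shape (k, 3, 3)
right = data[idx[:, 0] + fp[1][0], idx[:, 1] + fp[1][1]]
feature_values = srwd_batch(left, right)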

