#StackBounty: #performance #c #memory-management Tail implementation using a dynamic queue library in C

Bounty: 50

As per K&R exercise 5-13, I’ve written my own version of tail in C. It uses a newer version of a library for a dynamic queue that I also wrote and submitted here before (I’ve made a lot of changes to it since then). I’d like to get some feedback on how my code performs, both memory-wise and speed-wise, and how I might improve it in the future. I compared its performance to the GNU implementation of tail and found that for small files my program uses less memory, but for larger files it uses a fair bit more (although I did find that GNU tail leaks memory: 96 bytes according to Valgrind). I was hoping to get some insight into how GNU tail handles larger files better.

tail.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dqueue.h>

#define MAX_LEN 256
#define MAX_LINES 32
#define DEFAULT_TAIL 10

int getlinep(char *s, int lim);

int main(int argc, char *argv[])
{
    char *line;
    char *temp;
    int tail_len = DEFAULT_TAIL;
    queue_t * queue = NULL;
    int elements;
    int len;

    if(!(queue = queue_init(sizeof(char *)))) {
        fprintf(stderr, "Error %d: Could not initialise queue!n", MEM_ERROR);
        return MEM_ERROR;
    }

    if(argc >= 2) {
        if(atoi(*(++argv))) {
            tail_len = -atoi(*argv);
            if(tail_len <= 0 || tail_len > MAX_LEN)
                tail_len = DEFAULT_TAIL;
        }
    }

    for(elements = 0; elements < tail_len; elements++) {
        if(!(line = malloc(MAX_LEN))) {
            fprintf(stderr, "Error: Memory allocation failure!n");
            return MEM_ERROR;
        }

        if(!getlinep(line, MAX_LEN)) {
            free(line);

            if(elements == 0) {
                queue_destroy(&queue);
                return EXIT_SUCCESS;
            }

            break;
        }

        queue_push(&line, queue);
    }

    if(elements == tail_len) {
        if(!(temp = malloc(MAX_LEN))) {
            fprintf(stderr, "Error: Memory allocation failure!n");
            return MEM_ERROR;
        }
        for(;;) {
            if(!(len = getlinep(temp, MAX_LEN))) {
                free(temp);
                break;
            }

            queue_pop(&line, queue);
            memcpy(line, temp, len + 1);
            queue_push(&line, queue);
        }
    }

    for(; elements > 0; elements--) {
        queue_pop(&line, queue);
        printf("%s", line);
        free(line);
    }

    queue_destroy(&queue);

    return EXIT_SUCCESS;
}

int getlinep(char *s, int lim)
{
    int i;

    for(i = 0; i < lim - 1 && (*s = getchar()) != EOF && *s != '\n'; i++, s++)
        ;

    if(*s == '\n') {
        s++;
        i++;
    }

    *s = '\0';

    return i;
}

dqueue.h

#ifndef DQUEUE_H
#define DQUEUE_H

#include <stdlib.h>                                                     /* Needed for size_t */

#define QUEUE_OK 0
#define MEM_ERROR -1                                                    /* Memory allocation error */
#define SIZE_ERROR -2                                                   /* Queue dimension error */
#define INDEX_ERROR -3                                                  /* No data at given index */

#define BLOCK_SIZE 1024

typedef struct queue_element_t
{
    void * data;                                                        /* Contains the data stored at this node */
    void * next;                                                        /* Contains the pointer to the next element, or NULL if it's the tail node */
} queue_element_t;

typedef struct
{
    queue_element_t *   head;                                           /* Pointer to the head of the queue */
    queue_element_t *   tail;                                           /* Pointer to the tail of the queue */
    size_t              size;                                           /* The number of elements in the queue */
    size_t              element_width;                                  /* The size of each element in the queue */
    size_t              tail_pos;                                       /* The byte offset of the data being pushed into the queue (i.e. in the tail block) */
    size_t              head_pos;                                       /* The byte offset of the data being popped out of the queue (i.e. in the head block) */
    int                 status;
} queue_t;

queue_t *   queue_init(size_t element_size);                            /* Initialise the queue data structure */
void        queue_pop(void * const element, queue_t * queue);           /* Pop an element from the front of the queue, deals with cleanup when the head node is empty */
int         queue_push(const void * const element, queue_t * queue);    /* Push an element to the back of the queue, creates a new block when tail node is full */
int         queue_debug(const queue_t * const queue);                   /* Print information about the queue, returns the queue status if a valid queue pointer is given  */
void        queue_destroy(queue_t * queue);                         /* Destroy the queue data structure and any associated nodes */

#endif
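
For illustration, here is a minimal usage sketch of the interface (my own example, not part of the submission). Note that queue_init() flags SIZE_ERROR for element widths that don’t divide BLOCK_SIZE evenly, and that elements are copied in and out by value:

#include <stdio.h>
#include <dqueue.h>

int main(void)
{
    int i;
    int value;
    queue_t * queue = queue_init(sizeof(int));  /* 1024 % sizeof(int) == 0 */

    if(!queue || queue->status != QUEUE_OK)
        return EXIT_FAILURE;

    for(value = 1; value <= 5; value++)
        queue_push(&value, queue);              /* the int is copied into the current block */

    for(i = 0; i < 5; i++) {
        queue_pop(&value, queue);               /* copied back out in FIFO order */
        printf("%d\n", value);
    }

    queue_destroy(queue);
    return EXIT_SUCCESS;
}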

dqueue.c

/* 
 * Filename:    dqueue.c
 * Author:      Alexis Ferguson (highentropystring@gmail.com)
 * Date:        17/02/18
 * Licence:     GNU GPL V3
 *
 * Library for a lightweight, generic, and dynamically allocated queue
 *
 * Return/exit codes:
 *  QUEUE_OK        - No error
 *  SIZE_ERROR      - Queue size error (invalid block size or number of elements)
 *  MEM_ERROR       - Memory allocation error
 *  INDEX_ERROR     - Couldn't pop data from the queue
 *
 * All functions returning pointers will return NULL on memory allocation failure, else they will specify an error in queue->status for the user to handle
 *
 * Todo:
 *  - Add secure versions of queue_destroy() and queue_pop() to overwrite memory blocks that are no longer in use
 * 
 */

#include <dqueue.h>
#include <stdio.h>
#include <string.h>

queue_t * queue_init(size_t element_width)
{
    queue_t * queue;

    if(!(queue = malloc(sizeof(queue_t))))
        return NULL;

    if(BLOCK_SIZE % element_width != 0 || (queue->element_width = element_width) <= 0) {
        queue->status = SIZE_ERROR;
        return queue;
    }

    queue->tail_pos = 0;
    queue->head_pos = 0;
    queue->tail     = NULL;
    queue->head     = NULL;
    queue->size     = 0;
    queue->status   = QUEUE_OK;

    return queue;
}

void queue_destroy(queue_t * queue)
{
    queue_element_t * temp;

    if(queue == NULL)
        return;

    while(queue->head) {
        free(queue->head->data);
        temp = queue->head->next;
        free(queue->head);
        queue->head = temp;
    }

    queue->size             = 0;
    queue->status           = 0;
    queue->element_width    = 0;
    queue->tail_pos         = 0;
    queue->head_pos         = 0;
    queue->tail             = NULL;

    free(queue);
}

int queue_push(const void * const element, queue_t * queue)
{
    queue_element_t * new_element;

    if(queue->tail_pos == 0) {
        if(!(new_element = malloc(sizeof(queue_element_t)))) {
            queue->status = MEM_ERROR;
            return queue->status;
        }

        if(!(new_element->data = malloc(BLOCK_SIZE))) {
            free(new_element);
            queue->status = MEM_ERROR;
            return queue->status;
        }

        if(queue->head == NULL)
            queue->head = new_element;
        else
            queue->tail->next = new_element;

        queue->tail = new_element;
        queue->tail->next = NULL;
        queue->size++;
    }

    memcpy((char *)queue->tail->data + queue->tail_pos, element, queue->element_width);

    queue->tail_pos += queue->element_width;

    if(queue->tail_pos >= BLOCK_SIZE)
        queue->tail_pos = 0;

    return queue->status;
}

void queue_pop(void * const element, queue_t * queue)
{
    queue_element_t * temp;

    if(queue->head == NULL || ((queue->head == queue->tail) && (queue->head_pos == queue->tail_pos))) {
        if(queue->tail_pos == 0) { /* Catch an error related to resetting the tail position and incrementing a block after a block has been filled */
            queue->tail_pos = BLOCK_SIZE;
        } else {
            queue->status = INDEX_ERROR;
            return;
        }
    }

    memcpy(element, (char *)queue->head->data + queue->head_pos, queue->element_width);

    queue->head_pos += queue->element_width;

    if(queue->head_pos >= BLOCK_SIZE) {
        free(queue->head->data);
        temp = queue->head;
        queue->head = queue->head->next;
        free(temp);

        queue->head_pos = 0;
        queue->size--;
    }
}

int queue_debug(const queue_t * const queue)
{
    if(queue == NULL) {
        printf("Error: Invalid queue pointer!n");
        return MEM_ERROR;
    }

    if(queue->status == QUEUE_OK) {
        printf("Queue is %d blocks long with an element width of %d bytes with each block containing %d elements\nQueue head is at %p and the current element is %p\n", (int)queue->size, (int)queue->element_width, BLOCK_SIZE / (int)queue->element_width, (void *)queue->head, (void *)queue->tail);
    } else if(queue->status == MEM_ERROR) {
        printf("Memory error in queue!\n");
    } else if(queue->status == SIZE_ERROR) {
        printf("Size error in queue!\n");
    } else if(queue->status == INDEX_ERROR) {
        printf("Index error in queue!\n");
    }

    return queue->status;
}


Get this bounty!!!

#StackBounty: #8 #performance Performance is notoriously slow

Bounty: 100

This feels like a repeat post but I can’t find an actual answer. I just installed Drupal 8.4.4 on an Azure App Service (S3 Standard 4core 7GB RAM / PHP 7.0.6) and Azure Database for MySQL (Standard 100 computes / v5.6.26.0). I’ve been told the host isn’t the issue.

Out of the box, the admin side is incredibly slow, like 4-5 seconds per click. I enabled page caching and CSS/JS aggregation. Still dog slow. I worked through this, created my content types, added a couple of extensions, and enabled BigPipe (reported to help, but it didn’t).

What am I missing? How do I improve performance before I ask my content authors to start loading up content?

Status report from Drupal. (I have since fixed the reported trusted host settings error)

Performance of the page in Chrome. Looks like 2.8s waiting on the page

I created a test.php file, queried MySQL directly (the key_value table, about 576 rows), and dumped all of that to the page; it returned instantly. So it appears to be Drupal specifically?

Devel bar on public page 182 DB queries

Devel bar on admin page 233 DB queries


Get this bounty!!!

#StackBounty: #c #linux #performance #epoll #event-loop Eventloop has high ksoftirqd load; nginx does not but does same system-calls. W…

Bounty: 50

I wrote some code that has an epoll-eventloop, accepts new connections and pretends to be a http-server.
The posted code is the absolute minimum … I removed everything (including all error-checks) to make it as short and to the point as possible:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <netinet/ip.h>
#include <netinet/in.h>
#include <sys/uio.h>
#include <unistd.h>


int main () {
    int servFd = socket (AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_IP);
    int value = 1;
    setsockopt (servFd, SOL_SOCKET, SO_REUSEADDR, &value, sizeof (value));

    struct sockaddr_in servAddr;
    memset (&servAddr, 0, sizeof (servAddr));
    servAddr.sin_family = AF_INET;
    servAddr.sin_addr.s_addr = 0;
    servAddr.sin_port = htons (8081);
    bind (servFd, (struct sockaddr*)&servAddr, sizeof (servAddr));
    listen (servFd, 511);

    int efd = epoll_create1 (EPOLL_CLOEXEC);
    struct epoll_event epollEvt;
    epollEvt.events = EPOLLIN | EPOLLRDHUP;
    epollEvt.data.u32 = servFd;
    epoll_ctl (efd, EPOLL_CTL_ADD, servFd, &epollEvt);

    for (;;) {
        struct epoll_event pollEvent[512];
        int eventCount = epoll_wait (efd, pollEvent, 512, -1);
        for (int i = 0; i < eventCount; ++i) {
            struct epoll_event* curEvent = &pollEvent[i];
            if (curEvent->data.u32 == servFd) {
                int clientFd = accept4 (servFd, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC);
                struct epoll_event epollEvt;
                epollEvt.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
                epollEvt.data.u32 = clientFd;
                epoll_ctl (efd, EPOLL_CTL_ADD, clientFd, &epollEvt);
                continue;
            }

            int clientFd = curEvent->data.u32;
            char recvBuffer[2048];
            recvfrom (clientFd, recvBuffer, 2048, 0, NULL, NULL);
            char sndMsg[] = "HTTP/1.0 200 OK\nServer: Test\nDate: Sun, 14 May 2017 15:40:26 GMT\nContent-type: text/html\n\nHello world!";
            size_t sndMsgLength = sizeof (sndMsg) - 1;
            struct iovec sndBuffer;
            sndBuffer.iov_base = sndMsg;
            sndBuffer.iov_len = sndMsgLength;
            writev (clientFd, &sndBuffer, 1);
            close (clientFd);
        }
    }
    return 0;
}

localhost:~# gcc -Wall test.c -o test

localhost:~# gcc --version
gcc (Alpine 6.4.0) 6.4.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

I did some load-testing of this code and compared it with nginx to see if I did something wrong or if there is something to improve. I expected this code to be the fastest possible implementation since every other “real” webserver has to do a lot more stuff in userspace. But still … somehow nginx beats it in requests per second when using multiple load-generator threads. (Note that I configured nginx to use just one worker to handle every request)

//ab running 1 worker from local machine 
localhost:~# ab -n 100000 -c 1 http://10.0.0.111:8081/index.html
Requests per second:    13228.07 [#/sec] (mean)  //[to nginx]
Requests per second:    15300.35 [#/sec] (mean)  //[to testcode]
//ab running 4 worker from local machine 
localhost:~# ab -n 100000 -c 4 http://10.0.0.111:8081/index.html
Requests per second:    30902.63 [#/sec] (mean)  //[to nginx]
Requests per second:    24734.76 [#/sec] (mean)  //[to testcode]

The first test has the expected result … the test code is faster, since it doesn’t do anything except generate a hard-coded response. But why is nginx faster in a multi-threading setting? How can that be?
The only explanation I can come up with is that there is something different in kernel-space and that nginx uses some sockopts (like TCP_FASTOPEN or TCP_DEFER_ACCEPT) or maybe some other system calls to do its thing. That’s why I did some straces and made my code do the exact same thing as nginx does (from a kernel perspective) -> you can see the strace attached below. Still … it is faster and I don’t understand why.
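
For reference, such options would only show up as extra setsockopt calls on the listening socket. A hypothetical sketch of what that tuning could look like (my test code does not currently do any of this):

#include <netinet/tcp.h>

/* Sketch: let the kernel complete the TCP handshake but only wake up
 * accept4() once the client has actually sent data (deferred accept). */
int deferSecs = 1;
setsockopt (servFd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &deferSecs, sizeof (deferSecs));

/* Sketch: enable TCP Fast Open with room for 16 pending TFO requests. */
int tfoQueueLen = 16;
setsockopt (servFd, IPPROTO_TCP, TCP_FASTOPEN, &tfoQueueLen, sizeof (tfoQueueLen));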

//ab running 50 worker from remote machine 
localhost:~# ab -n 100000 -c 50 http://10.0.0.111:8081/index.html
Requests per second:    27507.60 [#/sec] (mean)  //[to nginx]
Requests per second:    24208.51 [#/sec] (mean)  //[to testcode]

This test case has the exact same result, but I noticed some differences in CPU usage.

  • My test-code runs at about 60% CPU-load and ksoftirqd/0 runs at about 80%
  • nginx runs at about 99% CPU-load and ksoftirqd/0 runs at just 30%
  • ksoftirqd/0 has no noticeable CPU-load in the local-host setting in both cases

strace of nginx:

localhost:~# strace -tt -f /usr/sbin/nginx 
13:28:20.413497 execve("/usr/sbin/nginx", ["/usr/sbin/nginx"], 0x7a59e3d96490 /* 16 vars */) = 0
13:28:20.413987 arch_prctl(ARCH_SET_FS, 0x74ae1cf94b88) = 0
13:28:20.414161 set_tid_address(0x74ae1cf94bc0) = 2186
13:28:20.414350 open("/etc/ld-musl-x86_64.path", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
13:28:20.414519 open("/lib/libpcre.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
13:28:20.414679 open("/usr/local/lib/libpcre.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
13:28:20.414886 open("/usr/lib/libpcre.so.1", O_RDONLY|O_CLOEXEC) = 3
13:28:20.415067 fcntl(3, F_SETFD, FD_CLOEXEC) = 0
13:28:20.415230 fstat(3, {st_mode=S_IFREG|0755, st_size=370360, ...}) = 0
13:28:20.415415 read(3, "177ELF2113>1@24"..., 960) = 960
13:28:20.415599 mmap(NULL, 2469888, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x74ae1caa9000
13:28:20.415809 mmap(0x74ae1cd02000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0x59000) = 0x74ae1cd02000
13:28:20.416020 close(3)                = 0
13:28:20.416218 open("/lib/libssl.so.44", O_RDONLY|O_CLOEXEC) = 3
13:28:20.416396 fcntl(3, F_SETFD, FD_CLOEXEC) = 0
13:28:20.416517 fstat(3, {st_mode=S_IFREG|0755, st_size=309664, ...}) = 0
13:28:20.416692 read(3, "177ELF2113>1pv1"..., 960) = 960
13:28:20.416939 mmap(NULL, 2408448, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x74ae1c85d000
13:28:20.417120 mmap(0x74ae1caa1000, 32768, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0x44000) = 0x74ae1caa1000
13:28:20.417337 close(3)                = 0
13:28:20.417504 open("/lib/libcrypto.so.42", O_RDONLY|O_CLOEXEC) = 3
13:28:20.417644 fcntl(3, F_SETFD, FD_CLOEXEC) = 0
13:28:20.417802 fstat(3, {st_mode=S_IFREG|0755, st_size=1714280, ...}) = 0
13:28:20.418090 read(3, "177ELF2113>10046"..., 960) = 960
13:28:20.418269 mmap(NULL, 3825664, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x74ae1c4b7000
13:28:20.418472 mmap(0x74ae1c836000, 159744, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0x17f000) = 0x74ae1c836000
13:28:20.418808 mmap(0x74ae1c859000, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x74ae1c859000
13:28:20.419067 close(3)                = 0
13:28:20.419280 open("/lib/libz.so.1", O_RDONLY|O_CLOEXEC) = 3
13:28:20.419478 fcntl(3, F_SETFD, FD_CLOEXEC) = 0
13:28:20.419716 fstat(3, {st_mode=S_IFREG|0755, st_size=91952, ...}) = 0
13:28:20.419901 read(3, "177ELF2113>1260!"..., 960) = 960
13:28:20.420065 mmap(NULL, 2191360, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x74ae1c2a0000
13:28:20.420246 mmap(0x74ae1c4b5000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0x15000) = 0x74ae1c4b5000
13:28:20.420429 close(3)                = 0
13:28:20.420621 mprotect(0x74ae1cd02000, 4096, PROT_READ) = 0
13:28:20.420932 mprotect(0x74ae1caa1000, 16384, PROT_READ) = 0
13:28:20.421552 mprotect(0x74ae1c836000, 118784, PROT_READ) = 0
13:28:20.421794 mprotect(0x74ae1c4b5000, 4096, PROT_READ) = 0
13:28:20.422001 mprotect(0x74ae1cf91000, 4096, PROT_READ) = 0
13:28:20.422309 mprotect(0xd10421a5000, 8192, PROT_READ) = 0
13:28:20.422553 brk(NULL)               = 0xd104602de80
13:28:20.422687 brk(0xd1046032000)      = 0xd1046032000
13:28:20.423269 brk(0xd1046033000)      = 0xd1046033000
13:28:20.423621 brk(0xd1046034000)      = 0xd1046034000
13:28:20.423875 brk(0xd1046035000)      = 0xd1046035000
13:28:20.424206 brk(0xd1046036000)      = 0xd1046036000
13:28:20.424570 brk(0xd1046037000)      = 0xd1046037000
13:28:20.424861 brk(0xd1046038000)      = 0xd1046038000
13:28:20.425098 brk(0xd1046039000)      = 0xd1046039000
13:28:20.425435 brk(0xd104603a000)      = 0xd104603a000
13:28:20.425605 brk(0xd104603b000)      = 0xd104603b000
13:28:20.425826 brk(0xd104603c000)      = 0xd104603c000
13:28:20.426096 brk(0xd104603d000)      = 0xd104603d000
13:28:20.426369 open("/etc/localtime", O_RDONLY|O_NONBLOCK|O_CLOEXEC) = 3
13:28:20.426549 fstat(3, {st_mode=S_IFREG|0644, st_size=127, ...}) = 0
13:28:20.426723 mmap(NULL, 127, PROT_READ, MAP_SHARED, 3, 0) = 0x74ae1cf8c000
13:28:20.426847 close(3)                = 0
13:28:20.427023 getpid()                = 2186
13:28:20.427164 open("/var/lib/nginx/logs/error.log", O_WRONLY|O_CREAT|O_APPEND, 0644) = 3
13:28:20.427341 brk(0xd104603e000)      = 0xd104603e000
13:28:20.427503 open("/etc/ssl/openssl.cnf", O_RDONLY) = 4
13:28:20.427680 brk(0xd104603f000)      = 0xd104603f000
13:28:20.427819 readv(4, [{iov_base="", iov_len=0}, {iov_base="[ req ]\n#default_bits\t\t= 2048\n#d"..., iov_len=1024}], 2) = 745
13:28:20.428089 brk(0xd1046040000)      = 0xd1046040000
13:28:20.428243 readv(4, [{iov_base="", iov_len=0}, {iov_base="", iov_len=1024}], 2) = 0
13:28:20.428476 close(4)                = 0
13:28:20.428718 brk(0xd1046041000)      = 0xd1046041000
13:28:20.428880 brk(0xd1046042000)      = 0xd1046042000
13:28:20.429179 brk(0xd1046043000)      = 0xd1046043000
13:28:20.429319 brk(0xd1046044000)      = 0xd1046044000
13:28:20.429552 brk(0xd1046045000)      = 0xd1046045000
13:28:20.429775 brk(0xd1046046000)      = 0xd1046046000
13:28:20.429935 brk(0xd1046047000)      = 0xd1046047000
13:28:20.430220 brk(0xd1046048000)      = 0xd1046048000
13:28:20.430391 brk(0xd1046049000)      = 0xd1046049000
13:28:20.430515 brk(0xd104604b000)      = 0xd104604b000
13:28:20.430796 brk(0xd104604c000)      = 0xd104604c000
13:28:20.431042 brk(0xd104604d000)      = 0xd104604d000
13:28:20.431238 brk(0xd104604e000)      = 0xd104604e000
13:28:20.431396 brk(0xd104604f000)      = 0xd104604f000
13:28:20.431581 brk(0xd1046050000)      = 0xd1046050000
13:28:20.431820 brk(0xd1046051000)      = 0xd1046051000
13:28:20.432112 brk(0xd1046054000)      = 0xd1046054000
13:28:20.432374 brk(0xd1046055000)      = 0xd1046055000
13:28:20.432509 brk(0xd1046056000)      = 0xd1046056000
13:28:20.432666 brk(0xd1046057000)      = 0xd1046057000
13:28:20.432836 brk(0xd1046058000)      = 0xd1046058000
13:28:20.433004 brk(0xd1046059000)      = 0xd1046059000
13:28:20.433203 brk(0xd104605a000)      = 0xd104605a000
13:28:20.433400 brk(0xd104605b000)      = 0xd104605b000
13:28:20.433610 brk(0xd104605c000)      = 0xd104605c000
13:28:20.433740 brk(0xd104605d000)      = 0xd104605d000
13:28:20.433895 brk(0xd104605e000)      = 0xd104605e000
13:28:20.434020 brk(0xd104605f000)      = 0xd104605f000
13:28:20.434240 brk(0xd1046060000)      = 0xd1046060000
13:28:20.434419 brk(0xd1046061000)      = 0xd1046061000
13:28:20.434622 uname({sysname="Linux", nodename="localhost", ...}) = 0
13:28:20.434801 sched_getaffinity(0, 128, [0, 1, 2, 3, 4, 5]) = 32
13:28:20.435077 prlimit64(0, RLIMIT_NOFILE, NULL, {rlim_cur=1024, rlim_max=4*1024}) = 0
13:28:20.435260 brk(0xd1046066000)      = 0xd1046066000
13:28:20.435424 uname({sysname="Linux", nodename="localhost", ...}) = 0
13:28:20.435578 brk(0xd104606b000)      = 0xd104606b000
13:28:20.435700 open("/etc/nginx/nginx.conf", O_RDONLY) = 4
13:28:20.435912 fstat(4, {st_mode=S_IFREG|0644, st_size=2781, ...}) = 0
13:28:20.436115 pread64(4, "\n\n\nuser nginx;\npcre_jit on;\n#tim"..., 2781, 0) = 2781
13:28:20.436284 geteuid()               = 0
13:28:20.436440 open("/etc/passwd", O_RDONLY|O_CLOEXEC) = 5
13:28:20.436596 fcntl(5, F_SETFD, FD_CLOEXEC) = 0
13:28:20.436725 fcntl(5, F_SETFD, FD_CLOEXEC) = 0
13:28:20.436857 readv(5, [{iov_base="", iov_len=0}, {iov_base="root:x:0:0:root:/root:/bin/ash\nb"..., iov_len=1024}], 2) = 1024
13:28:20.437047 readv(5, [{iov_base="", iov_len=0}, {iov_base="sbin/nologin\nntp:x:123:123:NTP:/"..., iov_len=1024}], 2) = 397
13:28:20.437235 lseek(5, -43, SEEK_CUR) = 1378
13:28:20.437353 close(5)                = 0
13:28:20.437520 open("/etc/group", O_RDONLY|O_CLOEXEC) = 5
13:28:20.437684 fcntl(5, F_SETFD, FD_CLOEXEC) = 0
13:28:20.437800 fcntl(5, F_SETFD, FD_CLOEXEC) = 0
13:28:20.437933 readv(5, [{iov_base="", iov_len=0}, {iov_base="root:x:0:root\nbin:x:1:root,bin,d"..., iov_len=1024}], 2) = 776
13:28:20.438097 close(5)                = 0
13:28:20.438240 epoll_create1(0)        = 5
13:28:20.438429 close(5)                = 0
13:28:20.438681 brk(0xd1046070000)      = 0xd1046070000
13:28:20.438842 brk(0xd1046072000)      = 0xd1046072000
13:28:20.439053 brk(0xd1046074000)      = 0xd1046074000
13:28:20.439175 brk(0xd1046076000)      = 0xd1046076000
13:28:20.439418 brk(0xd104607b000)      = 0xd104607b000
13:28:20.439655 mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x74ae1ce8b000
13:28:20.439886 brk(0xd1046080000)      = 0xd1046080000
13:28:20.440020 brk(0xd1046085000)      = 0xd1046085000
13:28:20.440225 open("/etc/nginx/mime.types", O_RDONLY) = 5
13:28:20.440380 fstat(5, {st_mode=S_IFREG|0644, st_size=3957, ...}) = 0
13:28:20.440523 pread64(5, "\ntypes {\n    text/html          "..., 3957, 0) = 3957
13:28:20.440725 close(5)                = 0
13:28:20.440977 brk(0xd104608a000)      = 0xd104608a000
13:28:20.441297 brk(0xd104608c000)      = 0xd104608c000
13:28:20.441453 close(4)                = 0
13:28:20.441587 mkdir("/var/tmp/nginx/client_body", 0700) = -1 EEXIST (File exists)
13:28:20.441814 stat("/var/tmp/nginx/client_body", {st_mode=S_IFDIR|0700, st_size=4096, ...}) = 0
13:28:20.442022 mkdir("/var/tmp/nginx/proxy", 0700) = -1 EEXIST (File exists)
13:28:20.442149 stat("/var/tmp/nginx/proxy", {st_mode=S_IFDIR|0700, st_size=4096, ...}) = 0
13:28:20.442257 mkdir("/var/tmp/nginx/fastcgi", 0700) = -1 EEXIST (File exists)
13:28:20.442407 stat("/var/tmp/nginx/fastcgi", {st_mode=S_IFDIR|0700, st_size=4096, ...}) = 0
13:28:20.442568 mkdir("/var/tmp/nginx/uwsgi", 0700) = -1 EEXIST (File exists)
13:28:20.442732 stat("/var/tmp/nginx/uwsgi", {st_mode=S_IFDIR|0700, st_size=4096, ...}) = 0
13:28:20.442945 mkdir("/var/tmp/nginx/scgi", 0700) = -1 EEXIST (File exists)
13:28:20.443051 stat("/var/tmp/nginx/scgi", {st_mode=S_IFDIR|0700, st_size=4096, ...}) = 0
13:28:20.443229 open("/var/log/nginx/access.log", O_WRONLY|O_CREAT|O_APPEND, 0644) = 4
13:28:20.443417 fcntl(4, F_SETFD, FD_CLOEXEC) = 0
13:28:20.443586 open("/var/log/nginx/error.log", O_WRONLY|O_CREAT|O_APPEND, 0644) = 5
13:28:20.443750 fcntl(5, F_SETFD, FD_CLOEXEC) = 0
13:28:20.443889 open("/var/lib/nginx/logs/error.log", O_WRONLY|O_CREAT|O_APPEND, 0644) = 6
13:28:20.444040 fcntl(6, F_SETFD, FD_CLOEXEC) = 0
13:28:20.444197 mmap(NULL, 2097152, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0) = 0x74ae1c0a0000
13:28:20.444382 socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 7
13:28:20.444515 setsockopt(7, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
13:28:20.444680 ioctl(7, FIONBIO, [1])  = 0
13:28:20.444808 bind(7, {sa_family=AF_INET, sin_port=htons(8081), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
13:28:20.445015 listen(7, 511)          = 0
13:28:20.445140 listen(7, 511)          = 0
13:28:20.445326 mmap(NULL, 65536, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x74ae1ce7b000
13:28:20.445493 prlimit64(0, RLIMIT_NOFILE, NULL, {rlim_cur=1024, rlim_max=4*1024}) = 0
13:28:20.445671 mmap(NULL, 1280, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0) = 0x74ae1ce7a000
13:28:20.445817 rt_sigprocmask(SIG_UNBLOCK, [RT_1 RT_2], NULL, 8) = 0
13:28:20.445977 rt_sigaction(SIGHUP, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.446097 rt_sigaction(SIGUSR1, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.446247 rt_sigaction(SIGWINCH, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.446438 rt_sigaction(SIGTERM, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.446635 rt_sigaction(SIGQUIT, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.446886 rt_sigaction(SIGUSR2, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.447093 rt_sigaction(SIGALRM, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.447236 rt_sigaction(SIGINT, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.447446 rt_sigaction(SIGIO, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.447767 rt_sigaction(SIGCHLD, {sa_handler=0xd1041f1f3fc, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.447888 rt_sigaction(SIGSYS, {sa_handler=SIG_IGN, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.448094 rt_sigaction(SIGPIPE, {sa_handler=SIG_IGN, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x74ae1cd4a6cf}, NULL, 8) = 0
13:28:20.448253 rt_sigprocmask(SIG_BLOCK, ~[], [], 8) = 0
13:28:20.448396 fork(strace: Process 2187 attached
)                  = 2187
[pid  2187] 13:28:20.448594 gettid( <unfinished ...>
[pid  2186] 13:28:20.448643 rt_sigprocmask(SIG_SETMASK, [],  <unfinished ...>
[pid  2187] 13:28:20.448671 <... gettid resumed> ) = 2187
[pid  2186] 13:28:20.448700 <... rt_sigprocmask resumed> NULL, 8) = 0
[pid  2187] 13:28:20.448765 rt_sigprocmask(SIG_SETMASK, [],  <unfinished ...>
[pid  2186] 13:28:20.448792 exit_group(0 <unfinished ...>
[pid  2187] 13:28:20.448812 <... rt_sigprocmask resumed> NULL, 8) = 0
[pid  2186] 13:28:20.448836 <... exit_group resumed>) = ?
[pid  2187] 13:28:20.448854 getpid()    = 2187
[pid  2187] 13:28:20.448951 setsid( <unfinished ...>
[pid  2186] 13:28:20.449046 +++ exited with 0 +++
13:28:20.449055 <... setsid resumed> )  = 2187
13:28:20.449107 umask(000)              = 022
13:28:20.449212 open("/dev/null", O_RDWR) = 8
13:28:20.449309 dup2(8, 0)              = 0
13:28:20.449455 dup2(8, 1)              = 1
13:28:20.449573 close(8)                = 0
13:28:20.449692 open("/run/nginx/nginx.pid", O_RDWR|O_CREAT|O_TRUNC, 0644) = 8
13:28:20.449848 pwrite64(8, "2187\n", 5, 0) = 5
13:28:20.449978 close(8)                = 0
13:28:20.450111 dup2(6, 2)              = 2
13:28:20.450218 close(3)                = 0
13:28:20.450376 rt_sigprocmask(SIG_BLOCK, [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO], NULL, 8) = 0
13:28:20.450499 socketpair(AF_UNIX, SOCK_STREAM, 0, [3, 8]) = 0
13:28:20.450603 ioctl(3, FIONBIO, [1])  = 0
13:28:20.450696 ioctl(8, FIONBIO, [1])  = 0
13:28:20.450830 ioctl(3, FIOASYNC, [1]) = 0
13:28:20.450964 fcntl(3, F_SETOWN, 2187) = 0
13:28:20.451079 fcntl(3, F_SETFD, FD_CLOEXEC) = 0
13:28:20.451148 fcntl(8, F_SETFD, FD_CLOEXEC) = 0
13:28:20.451244 rt_sigprocmask(SIG_BLOCK, ~[], [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO], 8) = 0
13:28:20.451379 fork(strace: Process 2188 attached
 <unfinished ...>
[pid  2188] 13:28:20.451596 gettid( <unfinished ...>
[pid  2187] 13:28:20.451615 <... fork resumed> ) = 2188
[pid  2187] 13:28:20.451727 rt_sigprocmask(SIG_SETMASK, [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO],  <unfinished ...>
[pid  2188] 13:28:20.451754 <... gettid resumed> ) = 2188
[pid  2187] 13:28:20.451774 <... rt_sigprocmask resumed> NULL, 8) = 0
[pid  2188] 13:28:20.451942 rt_sigprocmask(SIG_SETMASK, [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO],  <unfinished ...>
[pid  2187] 13:28:20.451969 rt_sigsuspend([], 8 <unfinished ...>
[pid  2188] 13:28:20.451985 <... rt_sigprocmask resumed> NULL, 8) = 0
[pid  2188] 13:28:20.452053 getpid()    = 2188
[pid  2188] 13:28:20.452330 rt_sigprocmask(SIG_BLOCK, ~[RTMIN RT_1 RT_2], [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO], 8) = 0
[pid  2188] 13:28:20.452621 rt_sigprocmask(SIG_BLOCK, ~[], NULL, 8) = 0
[pid  2188] 13:28:20.452893 prlimit64(0, RLIMIT_NOFILE, {rlim_cur=8*1024, rlim_max=8*1024}, NULL) = 0
[pid  2188] 13:28:20.453075 futex(0x74ae1cf95064, FUTEX_WAKE_PRIVATE, 2147483647) = 0
[pid  2188] 13:28:20.453279 rt_sigprocmask(SIG_SETMASK, [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO], NULL, 8) = 0
[pid  2188] 13:28:20.453487 geteuid()   = 0
[pid  2188] 13:28:20.453667 rt_sigprocmask(SIG_BLOCK, ~[RTMIN RT_1 RT_2], [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO], 8) = 0
[pid  2188] 13:28:20.453861 rt_sigprocmask(SIG_BLOCK, ~[], NULL, 8) = 0
[pid  2188] 13:28:20.454091 setgid(103) = 0
[pid  2188] 13:28:20.454335 futex(0x74ae1cf95064, FUTEX_WAKE_PRIVATE, 2147483647) = 0
[pid  2188] 13:28:20.454583 rt_sigprocmask(SIG_SETMASK, [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO], NULL, 8) = 0
[pid  2188] 13:28:20.454822 socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0) = 9
[pid  2188] 13:28:20.455183 connect(9, {sa_family=AF_UNIX, sun_path="/var/run/nscd/socket"}, 24) = -1 ENOENT (No such file or directory)
[pid  2188] 13:28:20.455537 close(9)    = 0
[pid  2188] 13:28:20.455800 open("/etc/group", O_RDONLY|O_CLOEXEC) = 9
[pid  2188] 13:28:20.456030 fcntl(9, F_SETFD, FD_CLOEXEC) = 0
[pid  2188] 13:28:20.456331 fcntl(9, F_SETFD, FD_CLOEXEC) = 0
[pid  2188] 13:28:20.456544 readv(9, [{iov_base="", iov_len=0}, {iov_base="root:x:0:root\nbin:x:1:root,bin,d"..., iov_len=1024}], 2) = 776
[pid  2188] 13:28:20.456799 readv(9, [{iov_base="", iov_len=0}, {iov_base="", iov_len=1024}], 2) = 0
[pid  2188] 13:28:20.456956 close(9)    = 0
[pid  2188] 13:28:20.457134 setgroups(3, [103, 82, 103]) = 0
[pid  2188] 13:28:20.457365 rt_sigprocmask(SIG_BLOCK, ~[RTMIN RT_1 RT_2], [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO], 8) = 0
[pid  2188] 13:28:20.457534 rt_sigprocmask(SIG_BLOCK, ~[], NULL, 8) = 0
[pid  2188] 13:28:20.457818 setuid(102) = 0
[pid  2188] 13:28:20.457990 futex(0x74ae1cf95064, FUTEX_WAKE_PRIVATE, 2147483647) = 0
[pid  2188] 13:28:20.458159 rt_sigprocmask(SIG_SETMASK, [HUP INT QUIT USR1 USR2 ALRM TERM CHLD WINCH IO], NULL, 8) = 0
[pid  2188] 13:28:20.458378 prctl(PR_SET_DUMPABLE, SUID_DUMP_USER) = 0
[pid  2188] 13:28:20.458598 chdir("/var/www") = 0
[pid  2188] 13:28:20.458868 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid  2188] 13:28:20.459703 epoll_create1(0) = 9
[pid  2188] 13:28:20.459994 eventfd2(0, 0) = 10
[pid  2188] 13:28:20.460340 epoll_ctl(9, EPOLL_CTL_ADD, 10, {EPOLLIN|EPOLLET, {u32=1109208384, u64=14363479846208}}) = 0
[pid  2188] 13:28:20.460600 eventfd2(0, 0) = 11
[pid  2188] 13:28:20.460878 ioctl(11, FIONBIO, [1]) = 0
[pid  2188] 13:28:20.461043 io_setup(32, [0x74ae1ce79000]) = 0
[pid  2188] 13:28:20.461389 epoll_ctl(9, EPOLL_CTL_ADD, 11, {EPOLLIN|EPOLLET, {u32=1109208032, u64=14363479845856}}) = 0
[pid  2188] 13:28:20.461729 socketpair(AF_UNIX, SOCK_STREAM, 0, [12, 13]) = 0
[pid  2188] 13:28:20.462043 epoll_ctl(9, EPOLL_CTL_ADD, 12, {EPOLLIN|EPOLLRDHUP|EPOLLET, {u32=1109208032, u64=14363479845856}}) = 0
[pid  2188] 13:28:20.462255 close(13)   = 0
[pid  2188] 13:28:20.462608 epoll_pwait(9, [{EPOLLIN|EPOLLHUP|EPOLLRDHUP, {u32=1109208032, u64=14363479845856}}], 1, 5000, NULL, 8) = 1
[pid  2188] 13:28:20.462969 close(12)   = 0
[pid  2188] 13:28:20.463325 mmap(NULL, 987136, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x74ae1bfaf000
[pid  2188] 13:28:20.463517 mmap(NULL, 397312, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x74ae1ce18000
[pid  2188] 13:28:20.464039 mmap(NULL, 397312, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x74ae1cdb7000
[pid  2188] 13:28:20.466039 epoll_ctl(9, EPOLL_CTL_ADD, 7, {EPOLLIN|EPOLLRDHUP, {u32=469430304, u64=128291142561824}}) = 0
[pid  2188] 13:28:20.466432 close(3)    = 0
[pid  2188] 13:28:20.466763 epoll_ctl(9, EPOLL_CTL_ADD, 8, {EPOLLIN|EPOLLRDHUP, {u32=469430544, u64=128291142562064}}) = 0
//Eventloop starts here
[pid  2188] 13:28:20.467046 epoll_pwait(9, [{EPOLLIN, {u32=469430304, u64=128291142561824}}], 512, -1, NULL, 8) = 1
[pid  2188] 13:28:34.390021 accept4(7, {sa_family=AF_INET, sin_port=htons(54280), sin_addr=inet_addr("10.0.0.15")}, [112->16], SOCK_NONBLOCK) = 3
[pid  2188] 13:28:34.390110 epoll_ctl(9, EPOLL_CTL_ADD, 3, {EPOLLIN|EPOLLRDHUP|EPOLLET, {u32=469430784, u64=128291142562304}}) = 0
[pid  2188] 13:28:34.390188 epoll_pwait(9, [{EPOLLIN, {u32=469430784, u64=128291142562304}}], 512, 30000, NULL, 8) = 1
[pid  2188] 13:28:34.390245 recvfrom(3, "GET /index.html HTTP/1.0\r\nHost: "..., 2048, 0, NULL, NULL) = 93
[pid  2188] 13:28:34.390462 writev(3, [{iov_base="HTTP/1.1 200 OK\r\nServer: nginx\r\n"..., iov_len=134}], 1) = 134
[pid  2188] 13:28:34.390602 close(3)    = 0

strace of the test code:

localhost:/~# strace -tt -f ./test 
13:31:19.964887 execve("./test", ["./test"], 0x721039661e10 /* 16 vars */) = 0
13:31:20.086769 arch_prctl(ARCH_SET_FS, 0x70311bc79b88) = 0
13:31:20.087599 set_tid_address(0x70311bc79bc0) = 2199
13:31:20.088375 mprotect(0x70311bc76000, 4096, PROT_READ) = 0
13:31:20.088717 mprotect(0x268c786b000, 4096, PROT_READ) = 0
13:31:20.088964 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, IPPROTO_IP) = 3
13:31:20.089232 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
13:31:20.089402 bind(3, {sa_family=AF_INET, sin_port=htons(8081), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
13:31:20.089579 listen(3, 511)          = 0
13:31:20.089797 epoll_create1(EPOLL_CLOEXEC) = 4
13:31:20.090018 epoll_ctl(4, EPOLL_CTL_ADD, 3, {EPOLLIN|EPOLLRDHUP, {u32=3, u64=3}}) = 0
13:31:20.090235 epoll_pwait(4, [{EPOLLIN, {u32=3, u64=3}}], 512, -1, NULL, 8) = 1
13:31:24.078593 accept4(3, NULL, NULL, SOCK_CLOEXEC|SOCK_NONBLOCK) = 5
13:31:24.078847 epoll_ctl(4, EPOLL_CTL_ADD, 5, {EPOLLIN|EPOLLRDHUP|EPOLLET, {u32=5, u64=5}}) = 0
13:31:24.079024 epoll_pwait(4, [{EPOLLIN, {u32=5, u64=5}}], 512, -1, NULL, 8) = 1
13:31:24.079197 recvfrom(5, "GET /index.html HTTP/1.0\r\nHost: "..., 2048, 0, NULL, NULL) = 93
13:31:24.079407 writev(5, [{iov_base="HTTP/1.0 200 OK\nServer: Test\nDat"..., iov_len=102}], 1) = 102
13:31:24.079604 close(5)                = 0

Edit:
I did some more tracing … 400000 requests from a remote host … still no clue why this happens:

localhost:/~# strace -c -f /usr/sbin/nginx 
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 47.11    0.040309           0    400000           writev
 44.55    0.038115           0    400021           close
  3.11    0.002658           0    400002           accept4
  1.80    0.001538           0    400002           recvfrom
  1.74    0.001486           0    400007           epoll_ctl
  1.69    0.001450           0    400008           epoll_pwait

localhost:/~# strace -c -f ./test 
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 47.90    0.042760           0    400002           writev
 44.27    0.039518           0    400002           close
  3.13    0.002793           0    400002           accept4
  1.80    0.001610           0    400002           recvfrom
  1.57    0.001400           0    400005           epoll_pwait
  1.33    0.001183           0    400003           epoll_ctl


Get this bounty!!!

#StackBounty: #python #performance #neural-network Deep Neural Network in Python

Bounty: 100

I have written a neural network in Python and focused on adaptability and performance. I want to use it to dive deeper into that field. I am far from being an expert in neural networks and the same goes for Python. I do not want to use Tensorflow since I really want to understand how a neural network works.

My questions are:

  • How can I increase the performance? At the moment it takes days to train the network.

The code runs on a single core. But since every loop over the batches runs independently, it can be parallelized.

  • How can I parallelize the loop over the batches?

I found some tutorials on parallel loops in Python, but I could not apply them to my problem.
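
One alternative I am considering is to vectorise over the batch instead of parallelising the Python loop, since NumPy already hands large matrix products to BLAS (which can use several cores by itself). A rough sketch of a fully batched step, assuming the same W, b, sigma and sigma_prime as in the code below, where X is a (batch_size, input_size) slice of the inputs and Y the matching (batch_size, output_size) one-hot labels:

from numpy import dot

def batch_step(X, Y, W, b, eta):
    batch_size = X.shape[0]

    # Feedforward for the whole batch at once: each activation becomes a
    # (layer_size[l], batch_size) matrix instead of a column vector.
    a = [X.T]
    z = []
    for l in range(len(W)):
        z.append(dot(W[l], a[l]) + b[l])   # b[l] broadcasts across the batch
        a.append(sigma(z[l]))

    # Backpropagation, also batched; gradients are averaged over the batch.
    delta = (a[-1] - Y.T) * sigma_prime(z[-1])
    for l in reversed(range(len(W))):
        dW = dot(delta, a[l].T) / batch_size
        db = delta.sum(axis=1, keepdims=True) / batch_size
        if l > 0:
            delta = dot(W[l].T, delta) * sigma_prime(z[l - 1])
        W[l] -= eta * dW
        b[l] -= eta * db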

Here is my tested code with some pseudo training data:

from numpy import random, zeros, array, dot
from scipy.special import expit
import time 

def sigma(x):
    return expit(x)

def sigma_prime(x):
    u = expit(x)
    return  u-u*u 

def SGD(I, L, batch_size, eta):

    images = len(L)

    # Pre-activation
    z = [zeros((layer_size[l],1)) for l in range(1,nn_size)]

    # Activations
    a = [zeros((layer_size[l],1)) for l in range(nn_size)]

    # Ground truth      
    y = zeros((images, layer_size[-1]))
    for i in range(images):
        y[i,L[i]] = 1.0

    while (1):

        t0 = time.time()

        # Create random batch
        batch = random.randint(0,images,batch_size)

        dW = [zeros((layer_size[l+1], layer_size[l])) for l in range(nn_size-1)]
        db = [zeros((layer_size[l],1)) for l in range(1, nn_size)]

        for i in batch:        
            # Feedforward
            a[0] = array([I[i]]).T
            for l in range(nn_size-1):
                z[l] = dot(W[l], a[l]) + b[l]
                a[l+1] = sigma(z[l])

            # Backpropagation
            delta = (a[nn_size-1]-array([y[i]]).T) * sigma_prime(z[nn_size-2])
            dW[nn_size-2] += dot(delta, a[nn_size-2].T)
            db[nn_size-2] += delta
            for l in reversed(range(nn_size-2)):
                delta = dot(W[l+1].T, delta) * sigma_prime(z[l])
                dW[l] += dot(delta, a[l].T)
                db[l] += delta

        # Update Weights and Biases
        for l in range(nn_size-1):
            W[l] += - eta * dW[l] / batch_size
            b[l] += - eta * db[l] / batch_size

        print(time.time() - t0)

input_size = 1000
output_size = 10

layer_size = [input_size, 30**2, 30**2, 30**2, output_size]

nn_size = len(layer_size)

# Weights
W = [random.randn(layer_size[l+1],layer_size[l]) for l in range(nn_size-1)]

# Bias
b = [random.randn(layer_size[l],1) for l in range(1,nn_size)]

# Some random training data with label
size_training_data = 1000
I = random.rand(size_training_data, input_size)
L = random.randint(0,10, size_training_data)

batch_size = 100
eta = 0.1
SGD(I, L, batch_size, eta)


Get this bounty!!!

#StackBounty: #sql-server #performance #index #index-tuning #deadlock SQL deadlock on nonclustered key caused by two INSERTs and a CHEC…

Bounty: 50

Been struggling with deadlocking on a table during INSERTs. It’s a multi-tenant database and Read Committed Snapshot Isolation (RCSI) is enabled.

Deadlock graph

There is a CHECK CONSTRAINT upon INSERT to enforce logic around double bookings; it executes a scalar-valued function and checks for a result of 0. The constraint’s function looks up the same table with a READCOMMITTEDLOCK hint to check for violations of the logic where the ID (PK/clustered index) doesn’t equal the ID of the newly inserted row.

The constraint does an INDEX SEEK on the index causing the deadlock: idx_report_foobar.

Any assistance would be greatly appreciated.

Here is the XML (which has been adjusted to remove some of the logic and the names of table fields in the database):

<deadlock>
 <victim-list>
  <victimProcess id="process91591c108" />
 </victim-list>
 <process-list>
  <process id="process91591c108" taskpriority="0" logused="1328" waitresource="KEY: 9:72057594095861760 (c2e966d5eb6a)" waittime="3046" ownerId="2628292921" transactionname="user_transaction" lasttranstarted="2018-03-09T14:24:13.820" XDES="0x708a80d80" lockMode="S" schedulerid="10" kpid="8964" status="suspended" spid="119" sbid="2" ecid="0" priority="0" trancount="2" lastbatchstarted="2018-03-09T14:24:13.823" lastbatchcompleted="2018-03-09T14:24:13.820" lastattention="1900-01-01T00:00:00.820" clientapp=".Net SqlClient Data Provider" hostname="SERVERNAMEHERE" hostpid="33672" loginname="DOMAINUSERHERE" isolationlevel="read committed (2)" xactid="2628292921" currentdb="9" lockTimeout="4294967295" clientoption1="671088672" clientoption2="128056">
   <executionStack>
    <frame procname="mydb.dbo.CheckForDoubleBookings" line="12" stmtstart="920" stmtend="3200" sqlhandle="0x0300090018ef9b72531bea009ea8000000000000000000000000000000000000000000000000000000000000">
IF EXISTS (SELECT * 
                 FROM   dbo.bookings a WITH (READCOMMITTEDLOCK)
                 WHERE  a.id &lt;&gt; @id 
                        AND a.userID = @userID 
                        AND @bookingStart &lt; a.bookingEnd 
                        AND a.bookingStart &lt; @bookingEnd
                        AND a.eventID = @eventID
    </frame>
    <frame procname="adhoc" line="1" stmtstart="288" stmtend="922" sqlhandle="0x020000005ed9af11c02db2af69df1d5fb6d1adb0e4812afb0000000000000000000000000000000000000000">
unknown    </frame>
    <frame procname="unknown" line="1" sqlhandle="0x0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000">
unknown    </frame>
   </executionStack>
   <inputbuf>
(@0 datetime2(7),@1 datetime2(7),@2 int,@3 int,@4 int,@5 int,@6 int,@7 nvarchar(4000),@8 datetime2(7),@9 nvarchar(50),@10 int,@11 nvarchar(255))INSERT [dbo].[bookings]([bookingStart], [bookingEnd], [userID], [eventID], [TypeId], [Notes], [Timestamp], [AddedById])
VALUES (@0, @1, @2, @3, @4, @5, @6, @7, @8, NULL, @9, @10, @11, NULL, NULL)
SELECT [Id]
FROM [dbo].[bookings]
WHERE @@ROWCOUNT &gt; 0 AND [Id] = scope_identity()   </inputbuf>
  </process>
  <process id="processca27768c8" taskpriority="0" logused="1328" waitresource="KEY: 9:72057594095861760 (3ba50d420e66)" waittime="3048" ownerId="2628280537" transactionname="user_transaction" lasttranstarted="2018-03-09T14:24:04.063" XDES="0xa555403b0" lockMode="S" schedulerid="6" kpid="12776" status="suspended" spid="124" sbid="2" ecid="0" priority="0" trancount="2" lastbatchstarted="2018-03-09T14:24:04.070" lastbatchcompleted="2018-03-09T14:24:04.063" lastattention="1900-01-01T00:00:00.063" clientapp=".Net SqlClient Data Provider" hostname="SERVERNAMEHERE" hostpid="33672" loginname="DOMAINUSERHERE" isolationlevel="read committed (2)" xactid="2628280537" currentdb="9" lockTimeout="4294967295" clientoption1="671088672" clientoption2="128056">
   <executionStack>
    <frame procname="mydb.dbo.CheckForDoubleBookings" line="12" stmtstart="920" stmtend="3200" sqlhandle="0x0300090018ef9b72531bea009ea8000000000000000000000000000000000000000000000000000000000000">
IF EXISTS (SELECT * 
                 FROM   dbo.bookings a WITH (READCOMMITTEDLOCK)
                 WHERE  a.id &lt;&gt; @id 
                        AND a.userID = @userID 
                        AND @bookingStart &lt; a.bookingEnd 
                        AND a.bookingStart &lt; @bookingEnd
                        AND a.eventID = @eventID
    </frame>
    <frame procname="adhoc" line="1" stmtstart="288" stmtend="922" sqlhandle="0x020000005ed9af11c02db2af69df1d5fb6d1adb0e4812afb0000000000000000000000000000000000000000">
unknown    </frame>
    <frame procname="unknown" line="1" sqlhandle="0x0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000">
unknown    </frame>
   </executionStack>
   <inputbuf>
(@0 datetime2(7),@1 datetime2(7),@2 int,@3 int,@4 int,@5 int,@6 int,@7 nvarchar(4000),@8 datetime2(7),@9 nvarchar(50),@10 int,@11 nvarchar(255))INSERT [dbo].[bookings]([bookingStart], [bookingEnd], [userID], [eventID], [TypeId], [Notes], [Timestamp], [AddedById])
VALUES (@0, @1, @2, @3, @4, @5, @6, @7, @8, NULL, @9, @10, @11, NULL, NULL)
SELECT [Id]
FROM [dbo].[bookings]
WHERE @@ROWCOUNT &gt; 0 AND [Id] = scope_identity()   </inputbuf>
  </process>
 </process-list>
 <resource-list>
  <keylock hobtid="72057594095861760" dbid="9" objectname="mydb.dbo.bookings" indexname="idx_report_foobar" id="locke83fdbe80" mode="X" associatedObjectId="72057594095861760">
   <owner-list>
    <owner id="processca27768c8" mode="X" />
   </owner-list>
   <waiter-list>
    <waiter id="process91591c108" mode="S" requestType="wait" />
   </waiter-list>
  </keylock>
  <keylock hobtid="72057594095861760" dbid="9" objectname="mydb.dbo.bookings" indexname="idx_report_foobar" id="lock7fdb48480" mode="X" associatedObjectId="72057594095861760">
   <owner-list>
    <owner id="process91591c108" mode="X" />
   </owner-list>
   <waiter-list>
    <waiter id="processca27768c8" mode="S" requestType="wait" />
   </waiter-list>
  </keylock>
 </resource-list>
</deadlock>

The index:

CREATE NONCLUSTERED INDEX [idx_report_foobar] ON [dbo].[bookings]
(
    [eventID] ASC
)
INCLUDE (   [bookingStart],
    [bookingEnd],
    [userID]) WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, DROP_EXISTING = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, FILLFACTOR = 80)
GO
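
For context, the check constraint described above is wired up roughly like this (a hypothetical reconstruction with simplified parameters, since the real definition was redacted):

ALTER TABLE [dbo].[bookings] WITH CHECK
ADD CONSTRAINT [chk_no_double_bookings]
CHECK ([dbo].[CheckForDoubleBookings]([id], [userID], [eventID], [bookingStart], [bookingEnd]) = 0)
GO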


Get this bounty!!!

#StackBounty: #python #performance #python-3.x Brute-force Hash Cracker

Bounty: 50

I made a hash cracker in Python (for purely educational purposes), but it’s really slow (~120 seconds for a 4 character string). How could I speed it up?

Current optimizations and explanations:

  • Closures in CharSet.get_advance: These are faster than attribute lookups.
  • iter in PasswordCracker.crack: This moves the loop into C.
  • CharSet.next as an array.array: Faster than a dict.

Possible future optimizations:

  • advance is kind of slow, but I’m not sure how to speed it up (one idea is sketched below).
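
The idea I have for that (a sketch, untested against the classes below, and it enumerates fresh candidates instead of reusing advance) is to lean on itertools.product, which would finally use the itertools import and keep candidate generation in C:

import hashlib
from itertools import count, product

def brute_force(target, chars=bytes(range(32, 127))):
    # Try candidates of increasing length; product() runs in C, so the
    # Python-level work per candidate is just the hash and the compare.
    for length in count(1):
        for candidate in product(chars, repeat=length):
            attempt = bytes(candidate)
            if hashlib.md5(attempt).digest() == target:
                return attempt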

Code:

import hashlib
from string import printable
from time import time
import itertools
from array import array

ENCODING = "ascii" # utf-8 for unicode support

class CharSet():
  def __init__(self, chars):
    chars = to_bytes(chars)
    self.chars = set(chars)
    self.first = chars[0]
    self.last = chars[-1]
    self.next = array("B", [0] * 256)
    for char, next_char in zip(chars, chars[1:]):
      self.next[char] = next_char

  def update_chars(self, new_chars):
    new_chars = to_bytes(new_chars)
    new_chars = set(new_chars) - self.chars
    if new_chars: # if there's anything new
      self.chars |= new_chars
      new_chars = list(new_chars)
      self.next[self.last] = new_chars[0]
      self.last = new_chars[-1]
      for char, next_char in zip(new_chars, new_chars[1:]):
        self.next[char] = next_char

  def get_advance(self, arr, hash_):
    first = self.first
    last = self.last
    next_ = self.next
    def advance():
      for ind, byte in enumerate(arr):
        if byte == last:
          arr[ind] = first
        else:
          arr[ind] = next_[byte]
          return hash_(arr)

      arr.append(first)
      return hash_(arr)

    return advance

class PasswordCracker():
  def __init__(self, hash_, chars=None):
    self.hash = hash_
    if chars is None:
      chars = printable
    self.char_set = CharSet(chars)

  def update_chars(self, string):
    self.char_set.update_chars(string)

  def crack(self, hashed):
    arr = bytearray()
    advance = self.char_set.get_advance(arr, self.hash)
    for _ in iter(advance, hashed):
      pass
    return arr

def to_bytes(string):
  if isinstance(string, str):
    return bytearray(string, ENCODING)
  elif isinstance(string, (bytes, bytearray)):
    return string
  else:
    raise TypeError(f"Cannot convert {string} to bytes")

def get_hasher(hash_):
  def hasher(bytes):
    return hash_(bytes).digest()

  return hasher

md5 = get_hasher(hashlib.md5)

cracker = PasswordCracker(md5)

password = input("Enter password: ")

cracker.update_chars(password)
password = md5(to_bytes(password))

start = time()
cracked = cracker.crack(password)
end = time()
print(f"Password cracked: {cracked.decode(ENCODING)}")
print(f"Time: {end - start} seconds.")

Profiling results (with password "pww"):

      1333313 function calls in 1.500 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.500    1.500 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 main.py:31(get_advance)
   333326    0.394    0.000    1.376    0.000 main.py:35(advance)
        1    0.124    0.124    1.500    1.500 main.py:58(crack)
   333326    0.311    0.000    0.982    0.000 main.py:74(hasher)
   333326    0.265    0.000    0.265    0.000 {built-in method _hashlib.openssl_md5}
        1    0.000    0.000    1.500    1.500 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 {built-in method builtins.iter}
        3    0.000    0.000    0.000    0.000 {method 'append' of 'bytearray' objects}
   333326    0.405    0.000    0.405    0.000 {method 'digest' of '_hashlib.HASH' objects}

Profiling results (with password "pwww", extra "w"):

         133333314 function calls in 190.800 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  190.799  190.799 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 main.py:31(get_advance)
 33333326   65.652    0.000  169.782    0.000 main.py:35(advance)
        1   21.017   21.017  190.799  190.799 main.py:58(crack)
 33333326   40.640    0.000  104.130    0.000 main.py:74(hasher)
 33333326   27.957    0.000   27.957    0.000 {built-in method _hashlib.openssl_md5}
        1    0.000    0.000  190.800  190.800 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 {built-in method builtins.iter}
        4    0.000    0.000    0.000    0.000 {method 'append' of 'bytearray' objects}
 33333326   35.533    0.000   35.533    0.000 {method 'digest' of '_hashlib.HASH' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


Get this bounty!!!

#StackBounty: #python #performance Brute-force Hash Cracker

Bounty: 50

I made a hash cracker in Python (for purely educational purposes), but it’s really slow (~120 seconds for a 4 character string). How could I speed it up?

Current optimizations and explanations:

  • Closures in CharSet.get_advance: These are faster than attribute lookups.
  • iter in PasswordCracker.crack: This moves the loop into C.
  • CharSet.next as an array.array: Faster than a dict.

Possible future optimizations:

  • advance is kind of slow, but I’m not sure how to speed it up.

Code:

import hashlib
from string import printable
from time import time
import itertools
from array import array

ENCODING = "ascii" # utf-8 for unicode support

class CharSet():
  def __init__(self, chars):
    chars = to_bytes(chars)
    self.chars = set(chars)
    self.first = chars[0]
    self.last = chars[-1]
    self.next = array("B", [0] * 256)
    for char, next_char in zip(chars, chars[1:]):
      self.next[char] = next_char

  def update_chars(self, new_chars):
    new_chars = to_bytes(new_chars)
    new_chars = set(new_chars) - self.chars
    if new_chars: # if theres anything new
      self.chars |= new_chars
      new_chars = list(new_chars)
      self.next[self.last] = new_chars[0]
      self.last = new_chars[-1]
      for char, next_char in zip(new_chars, new_chars[1:]):
        self.next[char] = next_char

  def get_advance(self, arr, hash_):
    first = self.first
    last = self.last
    next_ = self.next
    def advance():
      for ind, byte in enumerate(arr):
        if byte == last:
          arr[ind] = first
        else:
          arr[ind] = next_[byte]
          return hash_(arr)

      arr.append(first)
      return hash_(arr)

    return advance

class PasswordCracker():
  def __init__(self, hash_, chars=None):
    self.hash = hash_
    if chars is None:
      chars = printable
    self.char_set = CharSet(chars)

  def update_chars(self, string):
    self.char_set.update_chars(string)

  def crack(self, hashed):
    arr = bytearray()
    advance = self.char_set.get_advance(arr, self.hash)
    for _ in iter(advance, hashed):
      pass
    return arr

def to_bytes(string):
  if isinstance(string, str):
    return bytearray(string, ENCODING)
  elif isinstance(string, (bytes, bytearray)):
    return string
  else:
    raise TypeError(f"Cannot convert {string} to bytes")

def get_hasher(hash_):
  def hasher(bytes):
    return hash_(bytes).digest()

  return hasher

md5 = get_hasher(hashlib.md5)

cracker = PasswordCracker(md5)

password = input("Enter password: ")

cracker.update_chars(password)
password = md5(to_bytes(password))

start = time()
cracked = cracker.crack(password)
end = time()
print(f"Password cracked: {cracked.decode(ENCODING)}")
print(f"Time: {end - start} seconds.")

Profiling results (with password "pww"):

      1333313 function calls in 1.500 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.500    1.500 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 main.py:31(get_advance)
   333326    0.394    0.000    1.376    0.000 main.py:35(advance)
        1    0.124    0.124    1.500    1.500 main.py:58(crack)
   333326    0.311    0.000    0.982    0.000 main.py:74(hasher)
   333326    0.265    0.000    0.265    0.000 {built-in method _hashlib.openssl_md5}
        1    0.000    0.000    1.500    1.500 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 {built-in method builtins.iter}
        3    0.000    0.000    0.000    0.000 {method 'append' of 'bytearray' objects}
   333326    0.405    0.000    0.405    0.000 {method 'digest' of '_hashlib.HASH' objects}

Profiling results (with password "pwww", extra "w"):

         133333314 function calls in 190.800 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  190.799  190.799 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 main.py:31(get_advance)
 33333326   65.652    0.000  169.782    0.000 main.py:35(advance)
        1   21.017   21.017  190.799  190.799 main.py:58(crack)
 33333326   40.640    0.000  104.130    0.000 main.py:74(hasher)
 33333326   27.957    0.000   27.957    0.000 {built-in method _hashlib.openssl_md5}
        1    0.000    0.000  190.800  190.800 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 {built-in method builtins.iter}
        4    0.000    0.000    0.000    0.000 {method 'append' of 'bytearray' objects}
 33333326   35.533    0.000   35.533    0.000 {method 'digest' of '_hashlib.HASH' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
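
The tables above are output from Python's built-in cProfile profiler (note the _lsprof.Profiler row). For reference, a minimal way to produce such a table, assuming the cracker and password variables from the script above:

import cProfile

# With no explicit sort argument, cProfile orders rows by "standard name",
# exactly as in the tables shown here.
cProfile.run("cracker.crack(password)")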


Get this bounty!!!

#StackBounty: #performance #haskell #recursion #optimization #benchmarking Why do these fixpoint cata / ana morphism definitions outper…

Bounty: 100

Consider these definitions from a previous question:

type Algebra f a = f a -> a

cata :: Functor f => Algebra f b -> Fix f -> b
cata alg = alg . fmap (cata alg) . unFix

fixcata :: Functor f => Algebra f b -> Fix f -> b
fixcata alg = fix $ \f -> alg . fmap f . unFix

type CoAlgebra f a = a -> f a

ana :: Functor f => CoAlgebra f a -> a -> Fix f
ana coalg = Fix . fmap (ana coalg) . coalg

fixana :: Functor f => CoAlgebra f a -> a -> Fix f
fixana coalg = fix $ \f -> Fix . fmap f . coalg

I ran some benchmarks and the results surprised me: criterion reports something like a tenfold speedup, specifically when -O2 is enabled. I wonder what causes such a massive improvement, and I am beginning to seriously doubt my benchmarking abilities.

This is the exact criterion code I use:

smallWord, largeWord :: Word
smallWord = 2^10
largeWord = 2^20

shortEnv, longEnv :: Fix Maybe
shortEnv = ana coAlg smallWord
longEnv = ana coAlg largeWord

benchCata = nf (cata alg)
benchFixcata = nf (fixcata alg)

benchAna = nf (ana coAlg)
benchFixana = nf (fixana coAlg)

main = defaultMain
    [ bgroup "cata"
        [ bgroup "short input"
            [ env (return shortEnv) $ \x -> bench "cata"    (benchCata x)
            , env (return shortEnv) $ \x -> bench "fixcata" (benchFixcata x)
            ]
        , bgroup "long input"
            [ env (return longEnv) $ \x -> bench "cata"    (benchCata x)
            , env (return longEnv) $ \x -> bench "fixcata" (benchFixcata x)
            ]
        ]
    , bgroup "ana"
        [ bgroup "small word"
            [ bench "ana" $ benchAna smallWord
            , bench "fixana" $ benchFixana smallWord
            ]
        , bgroup "large word"
            [ bench "ana" $ benchAna largeWord
            , bench "fixana" $ benchFixana largeWord
            ]
        ]
    ]

And some auxiliary code:

alg :: Algebra Maybe Word
alg Nothing = 0
alg (Just x) = succ x

coAlg :: CoAlgebra Maybe Word
coAlg 0 = Nothing
coAlg x = Just (pred x)

Compiled with -O0, the numbers are pretty even. With -O2, the fix-based functions seem to outperform the plain ones:

benchmarking cata/short input/cata
time                 31.67 μs   (31.10 μs .. 32.26 μs)
                     0.999 R²   (0.998 R² .. 1.000 R²)
mean                 31.20 μs   (31.05 μs .. 31.46 μs)
std dev              633.9 ns   (385.3 ns .. 1.029 μs)
variance introduced by outliers: 18% (moderately inflated)

benchmarking cata/short input/fixcata
time                 2.422 μs   (2.407 μs .. 2.440 μs)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 2.399 μs   (2.388 μs .. 2.410 μs)
std dev              37.12 ns   (31.44 ns .. 47.06 ns)
variance introduced by outliers: 14% (moderately inflated)

I would appreciate it if someone could confirm this or spot a flaw.

(I compiled things with GHC 8.2.2 on this occasion.)


postscriptum

This post from back in 2012 elaborates on the performance of fix in quite fine detail. (Thanks to @chi for the link.)


Get this bounty!!!

#StackBounty: #machine-learning #predictive-models #performance #sensitivity #specificity Assessing correlated predictions

Bounty: 50

Let’s assume we have a prediction algorithm (if it helps, imagine it uses some boosted-tree method) that makes daily predictions of whether some event will happen to a unit (e.g. a machine that might break down, or a patient that might have a problematic medical event) during the next week. We can only get training and test data for a small number of units (e.g. 400 + 100 or so) across a limited period of time (e.g. half a year).

How would one assess the prediction performance (e.g. sensitivity/specificity/AUROC) of such an algorithm on test data in this setting? Presumably there is a potential issue in that the prediction intervals overlap, and even non-overlapping intervals for the same unit are somewhat correlated (i.e. if I can predict well for a particular unit due to its specific characteristics, I may do well on all time intervals for that unit, but this does not mean the algorithm would generalize well).

Perhaps I have just not hit on the right search terms, but I have failed to find anything published on this topic (surely someone has written about this before?!). Any pointers to relevant literature?

My initial thought was that naively calculated point estimates of sensitivity/specificity (i.e. just treating these as independent observations and predictions) might be fine, and that the problem would lie more with the uncertainty around those estimates. If so, could one just bootstrap (drawing whole units with replacement) and get decent assessments that way?
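
As a rough sketch of that bootstrap idea (a minimal illustration, not a vetted procedure; the array names y_true, y_score and unit_ids are hypothetical, one entry per unit-week prediction):

import numpy as np
from sklearn.metrics import roc_auc_score

def unit_bootstrap_auc(y_true, y_score, unit_ids, n_boot=2000, seed=0):
    """Bootstrap AUROC by resampling whole units with replacement, so all
    of a unit's (correlated) predictions stay together in each replicate."""
    rng = np.random.default_rng(seed)
    units = np.unique(unit_ids)
    aucs = []
    for _ in range(n_boot):
        drawn = rng.choice(units, size=len(units), replace=True)
        idx = np.concatenate([np.flatnonzero(unit_ids == u) for u in drawn])
        if len(np.unique(y_true[idx])) < 2:   # AUROC needs both classes present
            continue
        aucs.append(roc_auc_score(y_true[idx], y_score[idx]))
    return np.percentile(aucs, [2.5, 97.5])   # naive 95% percentile interval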


Get this bounty!!!

#StackBounty: #python #performance #beginner #numpy #markov-chain Solve the phase state between two haplotype blocks using markov trans…

Bounty: 50

I have spent a bit more than a year with Python, but I come from a biology background. I should say I have some understanding of for-loops and nested for-loops, which I used to work out a solution to the problem I had. But I suppose there are better and more comprehensive ways of solving the problem.

Below is the code I wrote to solve the haplotype phase state between two haplotype blocks.

Here is a short snippet of my data:

chr  pos       ms01e_PI  ms01e_PG_al  ms02g_PI  ms02g_PG_al  ms03g_PI  ms03g_PG_al  ms04h_PI  ms04h_PG_al  ms05h_PI  ms05h_PG_al  ms06h_PI  ms06h_PG_al
2    15881764  4         C|T          6         C|T          7         T|T          7         T|T          7         C|T          7         C|T
2    15881767  4         C|C          6         T|C          7         C|C          7         C|C          7         T|C          7         C|C
2    15881989  4         C|C          6         A|C          7         C|C          7         C|C          7         A|T          7         A|C
2    15882091  4         G|T          6         G|T          7         T|A          7         A|A          7         A|T          7         A|C
2    15882451  4         C|T          4         T|C          7         T|T          7         T|T          7         C|T          7         C|A
2    15882454  4         C|T          4         T|C          7         T|T          7         T|T          7         C|T          7         C|T
2    15882493  4         C|T          4         T|C          7         T|T          7         T|T          7         C|T          7         C|T
2    15882505  4         A|T          4         T|A          7         T|T          7         T|T          7         A|C          7         A|T

Problem:

In the given data I want to solve the phase state of the haplotype for the sample ms02g. The suffix PI represents the phased block (i.e. all data within that block is phased).

For sample ms02g, C-T-A-G is phased (PI level 6), and so is T-C-C-T. Similarly, the data within PI level 4 is also phased. But since the haplotype is broken across two levels, we don’t know which phase from level-6 goes with which phase of level-4. Phase to be solved - sample ms02g:

ms02g_PI  ms02g_PG_al
6         C|T
6         T|C
6         A|C
6         G|T
×——————————×—————> Break Point
4         T|C
4         T|C
4         T|C
4         T|A

Since all other samples are completely phased, I can use markov-chain transition probabilities to solve the phase state. To the human eye/mind you can clearly see that the left part of ms02g (level 6, i.e. C-T-A-G) is more likely to go with the right block of level 4 (C-C-C-A).

Calculation:

Step 01:

I prepare the required haplotype configurations:

  • The top haplotype-PI is block01 and the bottom is block02.
  • The left phased haplotype within each block is Hap-A and the right is Hap-B.

So, there are 2 possible haplotype configurations (see the short sketch after this list):

  • Block01-HapA with Block02-HapA, so B01-HapB with B02-HapB
  • Block01-HapA with Block02-HapB, so B01-HapB with B02-HapA
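
A minimal sketch of these two configurations in code, using the ms02g haplotypes read off the example data above (the variable names are mine, purely illustrative):

# Haplotypes of sample ms02g from the example data
block1_hapA, block1_hapB = list("CTAG"), list("TCCT")   # PI level 6
block2_hapA, block2_hapB = list("TTTT"), list("CCCA")   # PI level 4

# Configuration 1: B1-HapA joins B2-HapA (so B1-HapB joins B2-HapB)
config1 = (block1_hapA + block2_hapA, block1_hapB + block2_hapB)

# Configuration 2: B1-HapA joins B2-HapB (so B1-HapB joins B2-HapA)
config2 = (block1_hapA + block2_hapB, block1_hapB + block2_hapA)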

I count the number of transitions from each nucleotide of PI-6 to each nucleotide of PI-4 for each haplotype configuration.
Possible transitions from PI-6 to PI-4:

  Possible
  transitions    ms02g_PG_al
    │       ┌┬┬┬ C|T
    │       ││││ T|C
    │       ││││ A|C
    │       ││││ G|T
    └─────> ││││  ×—————> Break Point
            │││└ T|C
            ││└─ T|C
            │└── T|C
            └─── T|A
  • I multiply the transition probabilities from the first nucleotide of PI-6 to all nucleotides of PI-4, then similarly multiply the transition probabilities from the 2nd nucleotide of PI-6 to all nucleotides of PI-4, and so on.

  • then I sum the products of the transition probabilities from one level to the other.


  Possible
  transitions    ms02g_PG_al
    │       ┌┬┬┬ C|T 
    │       ││││ T|C | Block 1
    │       ││││ A|C |
    │       ││││ G|T /
    └─────> ││││  ×—————> Break Point
            │││└ T|C 
            ││└─ T|C |
            │└── T|C | Block 2
            └─── T|A /
                 ↓ ↓
             Hap A Hap B

Calculating likelihood:

  Block-1-Hap-A with Block-2-Hap-B    vs.    Block-1-Hap-A with Block-2-Hap-A

  C|C × C|C × C|C × A|C = 0.58612            T|C × T|C × T|C × T|C = 0.000244
+ C|T × C|T × C|T × A|T = 0.3164           + T|T × T|T × T|T × T|T = 0.00391
+ C|A × C|A × C|A × A|A = 0.482            + T|A × T|A × T|A × T|A = 0.0007716
+ C|G × C|G × C|G × A|G = 0.3164           + T|G × T|G × T|G × T|G = 0.00391
                          ———————                                    —————————
                    Avg = 0.42523                              Avg = 0.002207

        Likelihood Ratio = 0.42523 / 0.002207 = 192.67
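
In code, this calculation might be sketched as below; trans_prob is a hypothetical dict mapping 'from_to' pairs (e.g. 'C_A') to transition probabilities, which the full program estimates from all samples:

def config_score(block1_hap, block2_hap, trans_prob):
    """Average over block-1 sites of the product, across block-2 sites,
    of the transition probabilities P(block-2 base | block-1 base)."""
    products = []
    for b1 in block1_hap:
        prod = 1.0
        for b2 in block2_hap:
            prod *= trans_prob[b1 + '_' + b2]
        products.append(prod)
    return sum(products) / len(products)

# Likelihood ratio, e.g. 0.42523 / 0.002207 = 192.67 in the worked example:
# config_score(block1_hapA, block2_hapB, trans_prob) /
#     config_score(block1_hapA, block2_hapA, trans_prob)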

I have the following working code in Python 3.
I think the transition probabilities can mostly be optimised using numpy, or by using a markov-chain package, but I was not able to work them out the way I desired. Also, it took me some time to get to what I wanted to do.

Any suggestions with explanation are welcome (a numpy sketch follows the code below). Thanks.

#!/home/bin/python
from __future__ import division

import collections
from itertools import product
from itertools import islice
import itertools
import csv
from pprint import pprint


''' helper: transition probability pX_Y / pX, guarding against division by zero '''
def sum_Probs(pX_Y, pX):
    try:
        return float(pX_Y / pX)
    except ZeroDivisionError:
        return 0


with open("HaploBlock_Updated.txt", 'w') as f:

    ''' Write header (part 01): This is just the front part of the header. The rear part of the header is
    updated dynamically depending upon the number of samples '''
    f.write('\t'.join(['chr', 'pos', 'ms02g_PI', 'ms02g_PG_al', '\n']))

    with open('HaploBlock_for_test-forHMMtoy02.txt') as Phased_Data:

        ''' Note: Create a dictionary to store the required data. The Dict should contain information about
         the two adjacent haplotype blocks that need extending. In this example I want to extend the
         haplotypes for sample ms02g, which has two blocks, 6 and 4. So, I read the PI and PG value
         for this sample. Also, the data should be stored with some unique keys. Some keys are: chr, pos,
         sample (PI, PG within sample), possible haplotypes ... etc .. '''


        Phased_Dict = csv.DictReader(Phased_Data, delimiter='\t')
        grouped = itertools.groupby(Phased_Dict, key=lambda x: x['ms02g_PI'])


        ''' Function to read the data as blocks (based on PI values of ms02g) at a time'''
        def accumulate(data):
            acc = collections.OrderedDict()
            for d in data:
                for k, v in d.items():
                    acc.setdefault(k, []).append(v)
            return acc


        ''' Store data as keys,values '''
        grouped_data = collections.OrderedDict()
        for k, g in grouped:
            grouped_data[k] = accumulate(g)


        ''' Clear memory '''
        del Phased_Dict
        del grouped

        ''' Iterate over two Hap Blocks at once. This is done to obtain all possible Haplotype
        configurations between two blocks. The (keys, values) for the first block are represented as
        k1,v1 and for the later block as k2,v2. '''

        # Empty list for storing haplotypes from each block
        hap_Block1_A = [];
        hap_Block1_B = []
        hap_Block2_A = [];
        hap_Block2_B = []

        # Create empty list for possible haplotype configurations from above block
        hapB1A_hapB2A = []
        hapB1B_hapB2B = []

        ''' list of all available samples (index, value) '''
        sample_list = [('ms01e_PI', 'ms01e_PG_al'), ('ms02g_PI', 'ms02g_PG_al'),
                       ('ms03g_PI', 'ms03g_PG_al'), ('ms04h_PI', 'ms04h_PG_al'),
                       ('ms05h_PI', 'ms05h_PG_al'), ('ms06h_PI', 'ms06h_PG_al')]


        ''' read data from two consecutive blocks at a time '''
        for (k1, v1), (k2, v2) in zip(grouped_data.items(), islice(grouped_data.items(), 1, None)):
            ''' skip if one of the keys has no values'''
            if k1 == '.' or k2 == '.':
                continue


            ''' iterate over the first Haplotype Block, i.e. the k1 block.
            The nucleotides to the left of the phased SNPs are called Block01-haplotype-A,
            and similarly those on the right, Block01-haplotype-B. '''
            hap_Block1_A = [x.split('|')[0] for x in v1['ms02g_PG_al']] # the left haplotype of Block01
            hap_Block1_B = [x.split('|')[1] for x in v1['ms02g_PG_al']]


            ''' iterate over the second Haplotype Block,
            i.e the k2 block '''
            hap_Block2_A = [x.split('|')[0] for x in v2['ms02g_PG_al']]
            hap_Block2_B = [x.split('|')[1] for x in v2['ms02g_PG_al']]



            ''' Now, we have to start to solve the possible haplotype configuration.
            Possible haplotype Configurations will be, Either :
            1) Block01-haplotype-A phased with Block02-haplotype-A,
                creating -> hapB1A-hapB2A, hapB1B-hapB2B
            Or,
            2) Block01-haplotype-A phased with Block02-haplotype-B
                creating -> hapB1A-hapB2B, hapB1B-hapB2A '''

            ''' First possible configuration '''
            hapB1A_hapB2A = [hap_Block1_A, hap_Block2_A]
            hapB1B_hapB2B = [hap_Block1_B, hap_Block2_B]

            ''' Second Possible Configuration '''
            hapB1A_hapB2B = [hap_Block1_A, hap_Block2_B]
            hapB1B_hapB2A = [hap_Block1_B, hap_Block2_A]


            print('\nStarting MarkovChains')
            ''' Now, start preping the first order markov transition matrix
             for the observed haplotypes between two blocks.'''


            ''' To store the sum-values of the products of the transition probabilities.
            These sums are accumulated as each product-of-transitions is returned by the nested
            for-loop starting at "for m in range(....)" '''

            Sum_Of_Product_of_Transition_probabilities_hapB1A_hapB2A = \
                sumOf_PT_hapB1A_B2A = 0
            Sum_Of_Product_of_Transition_probabilities_hapB1B_hapB2B = \
                sumOf_PT_hapB1B_B2B = 0

            Sum_Of_Product_of_Transition_probabilities_hapB1A_hapB2B = \
                sumOf_PT_hapB1A_B2B = 0
            Sum_Of_Product_of_Transition_probabilities_hapB1B_hapB2A = \
                sumOf_PT_hapB1B_B2A = 0


            for n in range(len(v1['ms02g_PI'])):  # n-ranges from block01

                ''' Set the probabilities of each nucleotide to zero. They are updated for each level
                 of "n" after counting the number of each nucleotide at that position. '''
                pA = 0; pT = 0; pG = 0; pC = 0
                # nucleotide_prob = [0., 0., 0., 0.] # or store as numpy matrix

                # Also storing these values as Dict
                # probably can be improved
                nucleotide_prob_dict = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
                print('prob as Dict: ', nucleotide_prob_dict)


                ''' for storing the product-values of the transition probabilities.
                These are updated for each level of "n" paired with each level of "m". '''
                product_of_transition_Probs_hapB1AB2A = POTP_hapB1AB2A = 1
                product_of_transition_Probs_hapB1BB2B = POTP_hapB1BB2B = 1

                product_of_transition_Probs_hapB1AB2B = POTP_hapB1AB2B = 1
                product_of_transition_Probs_hapB1BB2A = POTP_hapB1BB2A = 1



                ''' Now, we read each level of "m" to compute the transition from each level of "n"
                to each level of "m". '''
                for m in range(len(v2['ms02g_PI'])): # m-ranges from block02
                    # transition for each level of 0-to-n level of V1 to 0-to-m level of V2
                    ''' set transition probabilities at Zero for each possible transition '''
                    pA_A = 0; pA_T = 0; pA_G = 0; pA_C = 0
                    pT_A = 0; pT_T = 0; pT_G = 0; pT_C = 0
                    pG_A = 0; pG_T = 0; pG_G = 0; pG_C = 0
                    pC_A = 0; pC_T = 0; pC_G = 0; pC_C = 0

                    ''' Also, creating an empty dictionary to store transition probabilities
                    - probably upgrade using numpy '''
                    transition_prob_dict = {'A_A' : 0, 'A_T' : 0, 'A_G' : 0, 'A_C' : 0,
                    'T_A' : 0, 'T_T' : 0, 'T_G' : 0, 'T_C' : 0,
                    'G_A' : 0, 'G_T' : 0, 'G_G' : 0, 'G_C' : 0,
                    'C_A' : 0, 'C_T' : 0, 'C_G' : 0, 'C_C' : 0}


                    ''' Now, loop through each sample to compute initial probs and transition probs '''
                    for x, y in sample_list:
                        print('sample x and y:', x,y)


                        ''' Update nucleotide probability for this site
                        - only calculated from v1 and only once for each parse/iteration '''
                        if m == 0:
                            pA += (v1[y][n].count('A'))
                            pT += (v1[y][n].count('T'))
                            pG += (v1[y][n].count('G'))
                            pC += (v1[y][n].count('C'))

                            nucleotide_prob_dict['A'] = pA
                            nucleotide_prob_dict['T'] = pT
                            nucleotide_prob_dict['G'] = pG
                            nucleotide_prob_dict['C'] = pC

                        nucleotide_prob = [pA, pT, pG, pC]


                        ''' Now, update transition matrix '''
                        nucl_B1 = (v1[y][n]).split('|')  # nucleotides at Block01
                        nucl_B2 = (v2[y][m]).split('|')  # nucleotides at Block02


                        ''' create possible haplotype configurations between "n" and "m".
                         If the indexes (PI values) are the same we create a zip; if the indexes
                         (PI values) are different we create a product. '''

                        HapConfig = []  # create empty list

                        if v1[x][n] == v2[x][m]:
                            ziped_nuclB1B2 = [(x + '_' + y) for (x,y) in zip(nucl_B1, nucl_B2)]
                            HapConfig = ziped_nuclB1B2

                            ''' Move this counting function elsewhere '''
                            pA_A += (HapConfig.count('A_A'))  # (v1[y][0].count('A'))/8
                            pA_T += (HapConfig.count('A_T'))
                            pA_G += (HapConfig.count('A_G'))
                            pA_C += (HapConfig.count('A_C'))

                            pT_A += (HapConfig.count('T_A'))  # (v1[y][0].count('A'))/8
                            pT_T += (HapConfig.count('T_T'))
                            pT_G += (HapConfig.count('T_G'))
                            pT_C += (HapConfig.count('T_C'))

                            pG_A += (HapConfig.count('G_A'))  # (v1[y][0].count('A'))/8
                            pG_T += (HapConfig.count('G_T'))
                            pG_G += (HapConfig.count('G_G'))
                            pG_C += (HapConfig.count('G_C'))

                            pC_A += (HapConfig.count('C_A'))
                            pC_T += (HapConfig.count('C_T'))
                            pC_G += (HapConfig.count('C_G'))
                            pC_C += (HapConfig.count('C_C'))


                        if v1[x][n] != v2[x][m]:
                            prod_nuclB1B2 = [(x + '_' + y) for (x,y) in product(nucl_B1, nucl_B2)]
                            HapConfig = prod_nuclB1B2
                            print('prod NuclB1B2: ', prod_nuclB1B2)

                            pA_A += (HapConfig.count('A_A'))/2
                            pA_T += (HapConfig.count('A_T'))/2
                            pA_G += (HapConfig.count('A_G'))/2
                            pA_C += (HapConfig.count('A_C'))/2

                            pT_A += (HapConfig.count('T_A'))/2
                            pT_T += (HapConfig.count('T_T'))/2
                            pT_G += (HapConfig.count('T_G'))/2
                            pT_C += (HapConfig.count('T_C'))/2

                            pG_A += (HapConfig.count('G_A'))/2
                            pG_T += (HapConfig.count('G_T'))/2
                            pG_G += (HapConfig.count('G_G'))/2
                            pG_C += (HapConfig.count('G_C'))/2

                            pC_A += (HapConfig.count('C_A'))/2
                            pC_T += (HapConfig.count('C_T'))/2
                            pC_G += (HapConfig.count('C_G'))/2
                            pC_C += (HapConfig.count('C_C'))/2


                    ''' Now, compute nucleotide and transition probabilities for each nucleotide
                    from each 0-n to 0-m across all samples. This updates the transition matrix in
                    each loop. **Note: at the end these transition probabilities should sum to 1 '''

                    ''' Storing nucleotide probabilities '''
                    nucleotide_prob = [pA, pT, pG, pC]

                    ''' Storing transition probability as dict'''
                    transition_prob_dict['A_A'] = sum_Probs(pA_A,pA)
                    transition_prob_dict['A_T'] = sum_Probs(pA_T,pA)
                    transition_prob_dict['A_G'] = sum_Probs(pA_G,pA)
                    transition_prob_dict['A_C'] = sum_Probs(pA_C,pA)

                    transition_prob_dict['T_A'] = sum_Probs(pT_A,pT)
                    transition_prob_dict['T_T'] = sum_Probs(pT_T,pT)
                    transition_prob_dict['T_G'] = sum_Probs(pT_G,pT)
                    transition_prob_dict['T_C'] = sum_Probs(pT_C,pT)

                    transition_prob_dict['G_A'] = sum_Probs(pG_A,pG)
                    transition_prob_dict['G_T'] = sum_Probs(pG_T,pG)
                    transition_prob_dict['G_G'] = sum_Probs(pG_G,pG)
                    transition_prob_dict['G_C'] = sum_Probs(pG_C,pG)

                    transition_prob_dict['C_A'] = sum_Probs(pC_A,pC)
                    transition_prob_dict['C_T'] = sum_Probs(pC_T,pC)
                    transition_prob_dict['C_G'] = sum_Probs(pC_G,pC)
                    transition_prob_dict['C_C'] = sum_Probs(pC_C,pC)


                    '''Now, we start solving the haplotype configurations once we have the
                      required data (nucleotide and transition probabilities).
                    Calculate the joint probability for each haplotype configuration.
                    Step 01: We calculate the product of the transitions from one level
                    of "n" to several levels of "m". '''

                    ''' finding possible configuration between "n" and "m". '''
                    hapB1A_hapB2A_transition = hapB1A_hapB2A[0][n] + '_' + hapB1A_hapB2A[1][m]
                    hapB1B_hapB2B_transition = hapB1B_hapB2B[0][n] + '_' + hapB1B_hapB2B[1][m]

                    hapB1A_hapB2B_transition = hapB1A_hapB2B[0][n] + '_' + hapB1A_hapB2B[1][m]
                    hapB1B_hapB2A_transition = hapB1B_hapB2A[0][n] + '_' + hapB1B_hapB2A[1][m]


                    ''' computing the products of transition probabilities on the for loop '''
                    POTP_hapB1AB2A *= transition_prob_dict[hapB1A_hapB2A_transition]
                    POTP_hapB1BB2B *= transition_prob_dict[hapB1B_hapB2B_transition]

                    POTP_hapB1AB2B *= transition_prob_dict[hapB1A_hapB2B_transition]
                    POTP_hapB1BB2A *= transition_prob_dict[hapB1B_hapB2A_transition]


                ''' Step 02: sum of the product of the transition probabilities '''
                sumOf_PT_hapB1A_B2A += POTP_hapB1AB2A
                sumOf_PT_hapB1B_B2B += POTP_hapB1BB2B

                sumOf_PT_hapB1A_B2B += POTP_hapB1AB2B
                sumOf_PT_hapB1B_B2A += POTP_hapB1BB2A


            ''' Step 03: Now, compute the likelihood of each haplotype configuration '''
            print('computing likelihood:')

            likelyhood_hapB1A_with_B2A_Vs_B2B = LH_hapB1AwB2AvsB2B = \
                sumOf_PT_hapB1A_B2A / sumOf_PT_hapB1A_B2B
            likelyhood_hapB1B_with_B2B_Vs_B2A = LH_hapB1BwB2BvsB2A = \
                sumOf_PT_hapB1B_B2B / sumOf_PT_hapB1B_B2A

            likelyhood_hapB1A_with_B2B_Vs_B2A = LH_hapB1AwB2BvsB2A = \
                sumOf_PT_hapB1A_B2B / sumOf_PT_hapB1A_B2A
            likelyhood_hapB1B_with_B2A_Vs_B2B = LH_hapB1BwB2AvsB2B = \
                sumOf_PT_hapB1B_B2A / sumOf_PT_hapB1B_B2B


            print('\nlikelihood values: B1A with B2A vs. B2B, B1B with B2B vs. B2A')
            print(LH_hapB1AwB2AvsB2B, LH_hapB1BwB2BvsB2A)

            print('\nlikelihood values: B1A with B2B vs. B2A, B1B with B2A vs. B2B')
            print(LH_hapB1AwB2BvsB2A, LH_hapB1BwB2AvsB2B)


            ''' Update the phase state of the block based on evidence '''
            if (LH_hapB1AwB2AvsB2B + LH_hapB1BwB2BvsB2A) > \
                4*(LH_hapB1AwB2BvsB2A + LH_hapB1BwB2AvsB2B):
                print('Block-1_A is phased with Block-2_A')

                for x in range(len(v1['ms02g_PI'])):
                    PI_value = v1['ms02g_PI'][x]
                    # Note: we transfer the PI value from the earlier block to the next block

                    f.write('\t'.join([v1['chr'][x], v1['pos'][x], v1['ms02g_PI'][x],
                                       hapB1A_hapB2A[0][x] + '|' + hapB1B_hapB2B[0][x], '\n']))

                for x in range(len(v2['ms02g_PI'])):
                    f.write('\t'.join([v2['chr'][x], v2['pos'][x], PI_value,
                                       hapB1A_hapB2A[1][x] + '|' + hapB1B_hapB2B[1][x], '\n']))

            elif (LH_hapB1AwB2AvsB2B + LH_hapB1BwB2BvsB2A) < \
                4 * (LH_hapB1AwB2BvsB2A + LH_hapB1BwB2AvsB2B):
                print('Block-1_A is phased with Block-2_B')

                for x in range(len(v1['ms02g_PI'])):
                    PI_value = v1['ms02g_PI'][x]
                    # Note: we transfer the PI value from the earlier block to the next block,
                        # but the phase positions are reversed

                    f.write('\t'.join([v1['chr'][x], v1['pos'][x], v1['ms02g_PI'][x],
                                       hapB1A_hapB2A[0][x] + '|' + hapB1B_hapB2B[0][x], '\n']))

                for x in range(len(v2['ms02g_PI'])):
                    f.write('\t'.join([v2['chr'][x], v2['pos'][x], PI_value,
                                       hapB1B_hapB2B[1][x] + '|' + hapB1A_hapB2A[1][x], '\n']))


            else:
                print('cannot solve the phase state with confidence - sorry ')
                for x in range(len(v1['ms02g_PI'])):
                    f.write('\t'.join([v1['chr'][x], v1['pos'][x], v1['ms02g_PI'][x],
                                       hapB1A_hapB2A[0][x] + '|' + hapB1B_hapB2B[0][x], '\n']))

                for x in range(len(v2['ms02g_PI'])):
                    f.write('\t'.join([v2['chr'][x], v2['pos'][x], v2['ms02g_PI'][x],
                                       hapB1A_hapB2A[1][x] + '|' + hapB1B_hapB2B[1][x], '\n']))
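
On the numpy idea mentioned before the code: a hedged sketch (untested against the real data) of vectorising the sixteen per-pair HapConfig.count() calls into one 4x4 count matrix; nucl_b1 and nucl_b2 are assumed equal-length sequences of single-base strings, one per paired haplotype:

import numpy as np

BASES = b"ATGC"   # byte values for the four nucleotides

def transition_counts(nucl_b1, nucl_b2):
    """4x4 matrix of transition counts (rows: block-1 base, cols: block-2 base)."""
    a = np.frombuffer(''.join(nucl_b1).encode(), dtype=np.uint8)
    b = np.frombuffer(''.join(nucl_b2).encode(), dtype=np.uint8)
    counts = np.zeros((4, 4), dtype=int)
    for i, x in enumerate(BASES):
        for j, y in enumerate(BASES):
            counts[i, j] = int(np.count_nonzero((a == x) & (b == y)))
    return counts

# e.g. transition_counts(['C', 'T'], ['C', 'A']) records one C->C and one T->A.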


Get this bounty!!!