#include "postgres.h"
#include "fmgr.h"
#include "miscadmin.h"
#include <math.h>

/* README
 * 
 * some useful references:
 * - http://www.postgresql.org/docs/9.0/interactive/xfunc-c.html
 *
 * NOTE: we use calling convention version 1 (the newer one) here
 *
 * INSTALL: make && make install && psql <db> -f randomness.sql
 */

/* Emit the module magic block whenever the PostgreSQL headers provide the
 * PG_MODULE_MAGIC macro; the guard keeps the file compilable where they
 * do not. */
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

/* Forward declarations of the version-1 calling convention functions
 * defined further down in this file. */
Datum expected_random_entropy(PG_FUNCTION_ARGS);
Datum byte_entropy(PG_FUNCTION_ARGS);
Datum bit_entropy(PG_FUNCTION_ARGS);
Datum chi_square(PG_FUNCTION_ARGS);

/* Histogram sizes used by the functions below:
 * BYTE_SIZE - number of distinct values a byte can take,
 * BIT_SIZE  - number of distinct values a bit can take. */
#define BYTE_SIZE 256
#define BIT_SIZE 2

/* This function calculates the byte entropy (Shannon entropy of the
 * byte-value distribution, in bits per byte) of a given bytea:
 *   http://en.wikipedia.org/wiki/Entropy_%28information_theory%29#Definition
 *
 * argument 0: the bytea to inspect
 * return value in [0.0, 8.0]; an empty bytea yields 0.0
 */
PG_FUNCTION_INFO_V1(byte_entropy);
Datum
byte_entropy(PG_FUNCTION_ARGS) {
    bytea *data = PG_GETARG_BYTEA_P(0);
    unsigned char *ptr = (unsigned char *) VARDATA(data);
    int32 tcount = VARSIZE(data) - VARHDRSZ, i;
    float8 entropy = 0, p;

    // occurrence count per byte value, all starting at zero
    int ccount[BYTE_SIZE] = {0};

    // count characters
    for (i = 0; i < tcount; i++) {
        // keep the query cancellable on large inputs. the parentheses are
        // essential: "!i%1000" parses as "(!i) % 1000" and never fired here.
        if ((i % 1000) == 0) CHECK_FOR_INTERRUPTS();
        ccount[ptr[i]]++;
    }

    if (tcount != 0) { // entropy is 0 if bytea was empty
        // calculate the probabilities for each char and sum entropy
        for (i = 0; i < BYTE_SIZE; i++) {
            if (ccount[i] == 0)
                continue;   // by convention p * log(p) counts as 0 for p = 0

            p = ccount[i] / (float8) tcount;
            entropy -= p * log(p) / log(2);
        }
    }

    PG_RETURN_FLOAT8(entropy);
}

/* This function calculates the byte entropy of a given bytea:
 *   http://en.wikipedia.org/wiki/Entropy_%28information_theory%29#Definition
 * return value in [0.0, 8.0]
 */
PG_FUNCTION_INFO_V1(expected_random_entropy);
Datum
expected_random_entropy(PG_FUNCTION_ARGS) {
    // this constant defines how many different random datablobs
    // we are going to inspect. the higher, the more accurate, but
    // the costs also increase nearly linearly.
    # define ITER 10

    // data length we calculate the entropy for
    int32 length = PG_GETARG_INT32(0);
    // alphabet size, e.g. 64 for base64 or 256 for binary/ascii data
    int32 alph_size = PG_GETARG_INT32(1);

    float8 entropies[ITER];
    float8 p, avg = 0.0;
    int4 ccount[BYTE_SIZE];
    int4 i, w, b;

    if (length <= 0 || alph_size <= 0 || length > 65536 || alph_size > 256) {
        ereport( WARNING, (
            errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
            errmsg( "specifying an alphabet size <= 0 or >= 256 doesn't make sense, really. also the length should be between 0 and 65536. Returning -1" )));
        
        PG_RETURN_FLOAT8((float8) -1.0);
    }

    srand(1337); // initialization
    for (i = 0; i < ITER; i++) {
        entropies[i] = (float8) 0.0;
        if (!i%1000) CHECK_FOR_INTERRUPTS();
        
        // initialize count array
        for (b = 0; b < BYTE_SIZE; b++) {
            ccount[b] = 0;
        }

        // fill <length> words into sample distribution and count
        for (w = 0; w < length; w++) {
            if (!w%1000) CHECK_FOR_INTERRUPTS();
            // this is the preferred way to calculate a random int number
            // check man rand to verify
            b = (int4) (((float8) alph_size) * rand() / (RAND_MAX+1.0));
            //b = (int) (rand() / (RAND_MAX+1.0) * (alph_size));
            //assert(b >= 0);
            //assert(b < alph_size);
            ccount[b]++;
        }

        // calculate the probabilities for each bit and sum entropy
        for (b = 0; b < BYTE_SIZE; b++) {
            if (ccount[b] == 0)
                continue;   // if p = 0 then log(p) = 0
            
            p = ccount[b] / (float8) length;
            entropies[i] -= (float8) (p * log(p) / log(2));

            ccount[b] = 0;
        }
    }
    // calculate avg
    avg = 0.0;
    for (i = 0; i < ITER; i++) {
        avg += (entropies[i] / ITER);
    }

    PG_RETURN_FLOAT8(avg);
}

/* This function calculates the bit entropy of a given bytea:
 *   http://en.wikipedia.org/wiki/Entropy_%28information_theory%29#Definition
 * return value in [0.0, 1.0]
 */
PG_FUNCTION_INFO_V1(bit_entropy);
Datum
bit_entropy(PG_FUNCTION_ARGS) {
    #define ITER 10

    bytea *data = PG_GETARG_BYTEA_P(0);
    unsigned char *ptr = (unsigned char *) VARDATA(data);
    int32 tcount = (VARSIZE(data) - VARHDRSZ) * 8, i, j;
    float8 entropy = 0, p;
    unsigned char c;
   
    // initialize character counts
    int ccount[BIT_SIZE];
    for (i = 0; i < BIT_SIZE; i++) {
        ccount[i] = 0;
    }

    // count characters
    for (i = VARSIZE(data) - VARHDRSZ; i; i--) {
        if (!i%1000) CHECK_FOR_INTERRUPTS();
        c = *ptr++;
        for (j = 0; j < 8; j++) {
            if (c & (1 << j)) {
                ccount[1]++;
            } else {
                ccount[0]++;
            }
        }
    }

    if (tcount != 0) { // entropy is 0 if bytea was empty
        // calculate the probabilities for each bit and sum entropy
        for (i = 0; i < BIT_SIZE; i++) {
            p = ccount[i] / (float8) tcount;
            if (p == 0)
                continue;   // if p = 0 then log(p) = 0

            entropy -= p * log(p) / log(2);
        }
    }

    PG_RETURN_FLOAT8(entropy);
}

/* This function calculates the chi-square statistic of a given bytea's
 * byte histogram against a uniform distribution over all BYTE_SIZE byte
 * values, i.e. the sum over all byte values of
 *   (observed - expected)^2 / expected
 * with expected = <number of bytes> / BYTE_SIZE.
 *
 * argument 0: the bytea to inspect
 * return value >= 0.0; an empty bytea yields 0.0
 */
PG_FUNCTION_INFO_V1(chi_square);
Datum
chi_square(PG_FUNCTION_ARGS) {
    bytea *data = PG_GETARG_BYTEA_P(0);
    unsigned char *ptr = (unsigned char *) VARDATA(data);
    int32 tcount = VARSIZE(data) - VARHDRSZ, i;
    float8 chisq = 0, cexp, diff;

    // occurrence count per byte value, all starting at zero
    int ccount[BYTE_SIZE] = {0};

    // count characters
    for (i = 0; i < tcount; i++) {
        // keep the query cancellable on large inputs. the parentheses are
        // essential: "!i%1000" parses as "(!i) % 1000" and never fired here.
        if ((i % 1000) == 0) CHECK_FOR_INTERRUPTS();
        ccount[ptr[i]]++;
    }

    if (tcount != 0) { // chisq is 0 if bytea was empty
        // expected count per byte value under a uniform distribution
        cexp = tcount / (float8) BYTE_SIZE;

        // calculate the chi-square per char
        for (i = 0; i < BYTE_SIZE; i++) {
            diff = ccount[i] - cexp;
            chisq += diff * diff / cexp;
        }
    }

    PG_RETURN_FLOAT8(chisq);
}
