/* -------------------------------------------------------------------------- */
/* Copyright (C) 2011, Georgia Institute of Technology.                       */
/* All rights reserved.                                                       */
/* See COPYING for license.                                                   */
/* -------------------------------------------------------------------------- */

#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include "gen_reads.h"
#include "common.h"


/* Integer type used in this file for sequence lengths and read counts. */
#define MY_TYPE unsigned long long

/*
 * ch_table maps an upper-case letter, indexed by (letter - 'A') for the
 * letters 'A'..'T', to a nucleotide code: A -> 0, C -> 1, G -> 2, T -> 3.
 * Every other letter in that range maps to 0.
 */
int ch_table[20] = {
    ['A' - 'A'] = 0,
    ['C' - 'A'] = 1,
    ['G' - 'A'] = 2,
    ['T' - 'A'] = 3
};

/* nt_table is the inverse mapping: nucleotide code -> letter. */
char nt_table[4] = { 'A', 'C', 'G', 'T' };

/*
 * Load the nucleotide sequence stored in a FASTA file.
 *
 * Header lines (lines beginning with '>') are skipped.  Every A/C/G/T
 * character, in either case, is stored upper-cased in the forward strand,
 * and its complement (A<->T, C<->G) is stored at the same index of the
 * complementary strand.  Every other character (newlines, 'N', ...) is
 * dropped.  The sequence lines of all records in the file are concatenated.
 *
 * fname    path of the FASTA file to read
 * len      out: number of bases stored
 * chrom    out: malloc'ed, NUL-terminated forward strand; caller frees
 * chrom_c  out: malloc'ed, NUL-terminated complement (same order as chrom,
 *          i.e. complemented but not reversed); caller frees
 *
 * Returns 0 on success and -1 on failure.  On failure nothing is written
 * through the output pointers.
 */
static int read_fasta_file (char * fname, MY_TYPE * len, char ** chrom, char ** chrom_c)
{
    FILE * fp;
    int first;                  /* int, not char, so that EOF is representable */
    char line[5000];
    MY_TYPE chr_len = 0;
    size_t line_len;
    size_t i;
    int at_line_start = 1;      /* does the next fgets chunk begin a new line? */
    int in_header = 0;          /* is the current chunk part of a '>' line?    */
    char * chr;
    char * chr_c;

    if ((fp = fopen (fname, "r")) == NULL)
    {
        DPRINTF (1, "Error: read fasta file %s failed.\n", fname);
        return -1;
    }

    /* Peek at the first character; a missing '>' is reported but not fatal. */
    first = getc(fp);
    if (first != '>')
    {
        DPRINTF (1, "%s does not seem to be in FastA format\n", fname);
    }
    if (first != EOF)
    {
        ungetc(first, fp);
    }

    /* Pass 1: the total character count is an upper bound on the base count. */
    while (fgets(line, (int)sizeof line, fp) != NULL)
    {
        chr_len += strlen(line);
    }

    chr_c = (char *)malloc ((chr_len + 1) * sizeof(char));
    chr = (char *)malloc ((chr_len + 1) * sizeof(char));
    if (chr == NULL || chr_c == NULL)
    {
        DPRINTF (1, "Error: out of memory while reading %s.\n", fname);
        free (chr);
        free (chr_c);
        fclose(fp);
        return -1;
    }

    /* Pass 2: rewind (offset first, then whence) and extract the bases. */
    chr_len = 0;
    if (fseek(fp, 0L, SEEK_SET) != 0)
    {
        DPRINTF (1, "Error: read fasta file %s failed.\n", fname);
        free (chr);
        free (chr_c);
        fclose(fp);
        return -1;
    }

    while (fgets(line, (int)sizeof line, fp) != NULL)
    {
        line_len = strlen(line);

        /*
         * fgets splits lines longer than the buffer into several chunks.
         * Only the first chunk of a line decides whether it is a header, so
         * the tail of an over-long header is not mistaken for sequence data.
         */
        if (at_line_start)
        {
            in_header = (line[0] == '>');
        }
        at_line_start = (line_len == 0 || line[line_len - 1] == '\n');
        if (in_header)
        {
            continue;
        }

        for (i = 0; i < line_len; i++)
        {
            switch (line[i])
            {
                case 'A':
                case 'a':
                    chr[chr_len] = 'A';
                    chr_c[chr_len++] = 'T';
                    break;

                case 'G':
                case 'g':
                    chr[chr_len] = 'G';
                    chr_c[chr_len++] = 'C';
                    break;

                case 'C':
                case 'c':
                    chr[chr_len] = 'C';
                    chr_c[chr_len++] = 'G';
                    break;

                case 'T':
                case 't':
                    chr[chr_len] = 'T';
                    chr_c[chr_len++] = 'A';
                    break;

                default:
                    /* anything that is not A/C/G/T is silently skipped */
                    break;
            }
        }
    }

    if (ferror(fp))
    {
        DPRINTF (1, "Error: read fasta file %s failed.\n", fname);
        free (chr);
        free (chr_c);
        fclose(fp);
        return -1;
    }

    chr[chr_len] = '\0';
    chr_c[chr_len] = '\0';
    fclose(fp);

    *len = chr_len;
    *chrom = chr;
    *chrom_c = chr_c;

    return 0;
}


/*
 * Pass one nucleotide code through the substitution-error model.
 *
 * Draws exactly one value from rand().  With probability (1 - error) the
 * code is returned unchanged; otherwise it is replaced by one of the three
 * other codes ((base + 1..3) % 4, each chosen for one third of the error
 * range) and *num_errs is incremented.
 *
 * base      nucleotide code in 0..3 (an index into nt_table)
 * error     per-base substitution probability
 * num_errs  in/out: running count of substituted bases
 */
static int substitute_base (int base, double error, int * num_errs)
{
    double err_flag = (double)rand()/RAND_MAX;
    int shift;

    if (err_flag >= error)
    {
        return base;
    }

    if (err_flag < error/3)
    {
        shift = 1;
    }
    else if (err_flag < 2 * error/3)
    {
        shift = 2;
    }
    else
    {
        shift = 3;
    }
    (*num_errs)++;

    return (base + shift) % 4;
}

/*
 * Write randomly placed reads, with substitution errors, to a FASTA file.
 *
 * The usable sequence length is first shrunk until len * coverage is an
 * exact multiple of read_len, which fixes the number of reads.  Each read
 * is then drawn at a random position and in a random direction: a forward
 * read copies chrom left-to-right, a reverse read copies chrom_c
 * right-to-left.  Every read is written as a ">READ" header line carrying
 * the read index, read length, 1-based position of its first base,
 * direction ('f' or 'r') and number of substituted bases, followed by the
 * read itself.
 *
 * outfile   path of the file to create or overwrite
 * chrom     forward strand; assumed to hold only 'A','C','G','T'
 * chrom_c   complement of chrom, same order and length
 * len       number of bases in chrom / chrom_c
 * coverage  requested coverage; must be > 0
 * error     per-base substitution probability
 * read_len  length of each read; must be > 0
 * seed      seed passed to srand()
 *
 * Returns the number of reads written (converted to int), or -1 on failure.
 */
static int generate_reads (char * outfile, char * chrom, char * chrom_c, MY_TYPE len,
                           int coverage, double error, int read_len, unsigned int seed)
{
    MY_TYPE read_num;
    MY_TYPE i;
    MY_TYPE start, pos;         /* as wide as len, so long sequences index safely */
    FILE * fp;
    int j;
    char * str;
    const char * strand;
    double dir_flag;
    int forward;
    int num_errs;
    int code;

    /* Both values are used as divisors below. */
    if (coverage <= 0 || read_len <= 0)
    {
        fprintf (stderr, "coverage and read_len must be positive\n");
        return -1;
    }

    printf ("len %llu %d %d\n", (unsigned long long)len, coverage, read_len);
    read_num = len * (MY_TYPE)coverage / (MY_TYPE)read_len;
    printf ("read_num %llu\n", (unsigned long long)read_num);

    /* Shrink len until len * coverage == read_num * read_len exactly. */
    while ((len * (MY_TYPE)coverage) != (read_num * (MY_TYPE)read_len))
    {
        printf ("leb %llu %llu %llu\n", (unsigned long long)len,
                (unsigned long long)(len * (MY_TYPE)coverage),
                (unsigned long long)(read_num * (MY_TYPE)read_len));
        len = read_num * (MY_TYPE)read_len / (MY_TYPE)coverage;
        printf ("leb %llu\n", (unsigned long long)len);
        read_num = len * (MY_TYPE)coverage / (MY_TYPE)read_len;
    }
    assert((len * (MY_TYPE)coverage) == (read_num * (MY_TYPE)read_len));

    if ((fp = fopen (outfile, "w+")) == NULL)
    {
        DPRINTF (1, "Error: read fasta file failed %s.\n", outfile);
        return -1;
    }
    srand (seed);
    if ((MY_TYPE)read_len > len)
    {
        fprintf (stderr, "read_len %d is larger than the chromosome size\n", read_len);
        fclose(fp);             /* do not leak the stream on this error path */
        return -1;
    }
    DPRINTF (1, ">sequence_len_used\t%llu\n", (unsigned long long)len);

    str = (char *)malloc (((size_t)read_len + 1) * sizeof(char));
    if (str == NULL)
    {
        fprintf (stderr, "out of memory\n");
        fclose(fp);
        return -1;
    }

    for (i = 0; i < read_num; i++)
    {
        num_errs = 0;

        /*
         * rand() may return RAND_MAX, which would yield start == len, one
         * past the last base.  Clamp so the reverse branch never reads the
         * terminating NUL (and never indexes ch_table with a negative value).
         */
        start = (MY_TYPE)((double)rand()/RAND_MAX * len);
        if (start >= len)
        {
            start = len - 1;
        }

        dir_flag = (double)rand()/RAND_MAX;
        forward = (dir_flag > 0.5);

        if (forward)
        {
            /* keep the whole read inside [0, len) */
            if (start > len - (MY_TYPE)read_len)
            {
                start = len - (MY_TYPE)read_len;
            }
            strand = chrom;
        }
        else
        {
            /* a reverse read walks towards index 0, so it needs read_len bases */
            if (start < (MY_TYPE)read_len)
            {
                start = (MY_TYPE)read_len - 1;
            }
            strand = chrom_c;
        }
        pos = start + 1;

        for (j = 0; j < read_len; j++)
        {
            code = ch_table[(int)(strand[start] - 'A')];
            str[j] = nt_table[substitute_base (code, error, &num_errs)];
            if (forward)
            {
                start++;
            }
            else
            {
                start--;        /* may wrap after the last base; not used again */
            }
        }
        str[read_len] = '\0';

        fprintf (fp, ">READ\t%llu\t%d\t%llu\t%c\t%d\n", (unsigned long long)i, read_len,
                 (unsigned long long)pos, (forward ? 'f' : 'r'), num_errs);
        fprintf (fp, "%s\n", str);
    }

    free (str);
    if (fclose(fp) != 0)
    {
        fprintf (stderr, "error while writing %s\n", outfile);
        return -1;
    }
    return (int)read_num;
}

/**
 * Write error-free reads in pairs whose members overlap by a requested
 * number of bases.
 *
 * Reads are produced two at a time.  The first read of a pair is placed at
 * a random position; the second is shifted by (read_len - overlap) bases to
 * the left or to the right of the first, so that the two share `overlap`
 * bases.  Near the ends of the sequence the second read is clamped to stay
 * inside the sequence, so the shared length may differ there; reads from
 * different pairs may also overlap by other amounts.  Each read is forward
 * (copied from chrom) or reversed (chrom_c read right-to-left), chosen at
 * random.  Every read is written as a ">READ" header line (index, length,
 * 1-based position of its first base, 'f' or 'r') followed by the read.
 *
 * outfile   path of the file to create or overwrite
 * chrom     forward strand
 * chrom_c   complement of chrom, same order and length
 * len       number of bases in chrom / chrom_c
 * coverage  requested coverage; must be > 0
 * error     unused; kept for signature parity with generate_reads
 * read_len  length of each read; must be > 0
 * seed      seed passed to srand()
 * overlap   requested overlap between the two reads of a pair; must be > 0
 *
 * Returns the number of reads written, or -1 on failure.
 */
static int generate_reads_with_given_overlap(char * outfile, char * chrom, char * chrom_c, int len,
        int coverage, double error, int read_len, unsigned int seed, unsigned int overlap)
{
    int read_num;
    int i;
    FILE * fp;
    int j;
    char * str;
    int start = 0;
    int pos = 0;
    double flag = 0.0;
    unsigned int orientation = NONE;

    (void)error;
    assert (overlap > 0);

    /* read_len is a divisor below; both values must be positive. */
    if (coverage <= 0 || read_len <= 0)
    {
        fprintf (stderr, "coverage and read_len must be positive\n");
        return -1;
    }

    /* Multiply in 64 bits: len * coverage easily exceeds the range of int. */
    read_num = (int)((long long)len * coverage / read_len);

    if ((fp = fopen (outfile, "w+")) == NULL)
    {
        DPRINTF (1, "Error: read fasta file failed %s.\n", outfile);
        return -1;
    }
    srand (seed);
    if (read_len > len)
    {
        fprintf (stderr, "read_len %d is larger than the chromosome size\n", read_len);
        fclose(fp);             /* do not leak the stream on this error path */
        return -1;
    }
    str = (char *)malloc (((size_t)read_len + 1) * sizeof(char));
    if (str == NULL)
    {
        fprintf (stderr, "out of memory\n");
        fclose(fp);
        return -1;
    }

    /* Generate a pair of reads each time with a given overlap length. */
    for (i = 0; i < read_num; i++)
    {
        if (i % 2 == 0)     /* first read of the pair */
        {
            start = (int)((double)rand()/RAND_MAX * len);
            /* rand() == RAND_MAX gives start == len, one past the last base */
            if (start > len - 1)
            {
                start = len - 1;
            }
            orientation = NONE;
        }
        else                /* overlapping read of the pair */
        {
            long long shift_left;
            long long shift_right;
            long long chosen;
            int left_fits;
            int right_fits;

            /* Leftmost base of the previous read. */
            start = pos - 1;
            if (orientation == REVERSED)
            {
                start = start - read_len + 1;
            }

            /*
             * Candidate leftmost positions for the second read.  Computed in
             * signed 64-bit arithmetic: overlap is unsigned, and mixing it
             * into an int expression would make ">= 0" always true.
             */
            shift_left  = (long long)start + (long long)overlap - read_len;
            shift_right = (long long)start + read_len - (long long)overlap;
            left_fits  = (shift_left >= 0);
            right_fits = (shift_right + read_len <= len);

            if (left_fits && right_fits)
            {
                /* toss to select either end */
                flag = (double) rand() / RAND_MAX;
                chosen = (flag > 0.5) ? shift_left : shift_right;
            }
            else if (left_fits)
            {
                chosen = shift_left;
            }
            else
            {
                chosen = shift_right;
            }

            /* Keep the whole read inside [0, len). */
            if (chosen > (long long)len - read_len)
            {
                chosen = (long long)len - read_len;
            }
            if (chosen < 0)
            {
                chosen = 0;
            }
            start = (int)chosen;

            /* toss to select orientation */
            flag = (double) rand() / RAND_MAX;
            if (flag > 0.5)
            {
                orientation = FORWARD;
            }
            else
            {
                orientation = REVERSED;
                /* a reversed read starts at its rightmost base */
                start = start + read_len - 1;
            }
        }

        /* Select orientation for the first read. */
        if (orientation == NONE)
        {
            flag = (double)rand()/RAND_MAX;
            if (flag > 0.5)
                orientation = FORWARD;
            else
                orientation = REVERSED;
        }

        if (orientation == FORWARD)
        {
            if (start > len - read_len)
                start = len - read_len;
            pos = start + 1;
            memcpy (str, chrom + start, (size_t)read_len);
        }
        else
        {
            if (start < read_len)
                start = read_len - 1;
            pos = start + 1;
            for (j = 0; j < read_len; j++)
            {
                str[j] = chrom_c[start];
                start --;
            }
        }
        str[read_len] = '\0';
        fprintf (fp, ">READ\t%d\t%d\t%d\t%c\n", i, read_len, pos,
                 ((orientation == FORWARD) ? 'f' : 'r'));
        fprintf (fp, "%s\n", str);
    }

    free (str);
    if (fclose(fp) != 0)
    {
        fprintf (stderr, "error while writing %s\n", outfile);
        return -1;
    }
    return read_num;
}

/* Print the one-line command-line synopsis to stderr. */
static void usage(void)
{
    static const char synopsis[] =
        "Usage: gen_reads -i <input file> -o <output file> -c <coverage> -l <reads length> -e <error rate> (-r seed -m overlap)\n";

    fputs (synopsis, stderr);
}


int main (int argc, char ** argv)
{
    char outfile[255] = "reads.fa";
    MY_TYPE chr_len = 0;
    int read_len = 100;
    double error = 0.0;
    int coverage = 20;
    char infile[255] = "chr.fa";
    MY_TYPE read_num = 0;
    int c = 0;
    int err_flag = 0;
    char * chrom;
    char * chrom_c;
    unsigned int seed = time(NULL);
    unsigned int overlap = 0;

    while ((c = getopt (argc, argv, ":he:l:i:o:c:r:m:")) != -1)
    {
        switch (c)
        {
            case 'h':
                usage();
                break;
            case 'e':
                error = atof(optarg);
                if (error > 1.0 || error < 0.0)
                {
                    fprintf (stderr, "error out of range\n");
                    err_flag = 1;
                }
                break;
            case 'l':
                read_len = atoi(optarg);
                if (read_len <= 0)
                {
                    fprintf (stderr, "read_len out of range\n");
                    err_flag = 1;
                }
                break;
            case 'i':
                strcpy(infile, optarg);
                break;
            case 'o':
                strcpy(outfile, optarg);
                break;
            case 'c':
                coverage = atoi(optarg);
                if (coverage <= 0)
                {
                    fprintf (stderr, "coverage out of range\n");
                    err_flag = 1;
                }
                break;
            case 'r':
				seed = atoi(optarg);
				break;
            case 'm':
            	overlap = abs(atoi(optarg));
            	break;
            case ':':
                fprintf (stderr, "Option -%c requires an argument.\n", optopt);
                err_flag = 1;
                break;
            case '?':
                fprintf (stderr, "Unknown option `-%c'.\n", optopt);
                err_flag = 1;
                break;
             default:
                err_flag = 1;
         }  
    }
    if (err_flag == 1)
    {
        usage();
        return -1;
    }
    
    printf("Genrate reads from fasta file %s\n", infile);
    printf ("Coverage: %d\n", coverage);
    printf ("Read length: %u\n", read_len);
    printf ("Error rate: %lf\n\n", error);
    
    if (read_fasta_file(infile, &chr_len, &chrom, &chrom_c) != 0)
        return -1;
    printf ("Gererating reads from chromosome of length %Lu ...\n\n", chr_len);

    if (overlap == 0)
    {
    	read_num = generate_reads(outfile, chrom, chrom_c, chr_len, coverage, error, read_len, seed);
    }
    else
    {
    	read_num = generate_reads_with_given_overlap(outfile, chrom, chrom_c, chr_len, coverage, error, read_len, seed, overlap);
    }
    printf ("Output fasta file %s with %Lu reads\n", outfile, read_num);

    free (chrom);
    free (chrom_c);
    return 0;
}
