/* ----------------------------------------------------------------------
 * make matrix out of a list of sequences
 * Copyright (C) 2000 January Weiner III
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 ---------------------------------------------------------------------- */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>

#include "genpak.h"
#include "gp_getopt.h"

#define VERSION "0.1"
#define PROGNAME "gp_mkmtx"

char *progname ;

/* Run-time settings of the program, filled in by main(). */
typedef struct
{
	FILE *in ;	/* stream the sequences are read from */
	FILE *out ;	/* stream the matrix is written to */
	double gc ;	/* GC content in percent used for scaling; values <= 0 turn that scaling off */
	int log ;	/* non-zero: replace every matrix cell by its logarithm */
	int freq ;	/* non-zero: divide the counts by the number of sequences */
} opt_s ;

/* Forward declarations of the helpers defined further down in this file. */

/* read the sequences from `in` and build the per-position count table;
 * the number of rows goes to nel[0], the number of sequences to nseq[0] */
double* GetMatrix(FILE* in, int* nel, int* nseq) ;
/* write the first `elements` rows of the table t to `out` */
int DisplayMatrix(FILE *out, int elements, double *t) ;
/* rescale the table t in place according to the settings in opt */
int ScaleMatrix(double *t, int nseq, int nel, opt_s opt) ;

/*
 * Entry point.
 *
 * Parses the command-line options, opens the input (first non-option
 * argument, otherwise standard input) and the output (second non-option
 * argument, otherwise standard output), builds the per-position nucleotide
 * table with GetMatrix(), rescales it with ScaleMatrix() and writes it with
 * DisplayMatrix().
 *
 * Returns 0 on success. Fatal problems are reported through gp_error(); the
 * code below assumes that gp_error() does not return -- TODO confirm against
 * the genpak library.
 */

int main(int argc, char *argv[])
{
	extern int optind ;
	extern char *optarg ;
	opt_s options ;
	double *tablica ;	/* the table returned by GetMatrix() */

	int c, el = 0, nseq = 0;
	int errflg = 0 ;

	/* defaults: log-scaled frequencies, no GC scaling */
	options.log = TRUE ;
	options.freq = TRUE ;
	options.gc = -1 ;

	progname = argv[0] ;

	while ((c = gp_getopt(argc, argv, "adg:hlqvH")) != EOF) {
		switch(c) {
		case 'a':
			options.freq = FALSE ;
			options.log = FALSE ;
			if(debug) gp_warn("Will print only absolute numbers of occurencies") ;
			break ;
		case 'd':
			debug = TRUE ;
			if(quiet) gp_error("Either debug or quiet, pick any one") ;
			gp_warn("Running in debug mode") ;
			break ;
		case 'g':
			/* gp_error() takes a printf-style format, so a literal percent
			 * sign has to be written as "%%" */
			if(sscanf(optarg,"%lf",&options.gc) != 1)
				gp_error("Could not read the command line parameter GC%%") ;
			/* ScaleMatrix() divides by (100 - gc)/200; at 100 or above that
			 * divisor is zero or negative and the output would be garbage */
			if(options.gc >= 100)
				gp_error("GC%% must be lower than 100, got %.2f",options.gc) ;
			if(debug) 
				gp_warn("Scaling matrix to frequencies expected at %.2f %% GC",options.gc) ;
			break ;
		case 'h':
			Help() ;
			break ;
		case 'l':
			options.log = FALSE ;
			if(debug) gp_warn("Will not apply logarythmic scaling") ;
			break ;
		case 'q':
			quiet = TRUE ;
			if(debug) gp_error("Either debug or quiet, pick any one") ;
			break ;
		case 'v':
			fprintf(stderr,"%s version %s\n",progname,VERSION) ;
			exit(0) ;
			break ;
		case 'H':
			html = TRUE ;
			break ;
		default:
			errflg++ ;
			break;
		}
	}


	if(errflg) gp_error("Type '%s -h' for help",progname) ;

/* open the file pointer to read the sequences 
 * from: standard input or a file provided? */ 
	if(optind >= argc) options.in = stdin ;
	else options.in = gp_file_open(argv[optind],"r") ;

/* opening the file pointer to write the output: 
 * standard output or file provided? */
	optind++ ;

	if(optind >= argc) options.out = stdout ;
	else options.out = gp_file_open(argv[optind],"wb") ;

	tablica = GetMatrix(options.in,&el,&nseq) ;
	ScaleMatrix(tablica,nseq,el,options) ;

	if(debug) gp_warn("%i nucleotides",el) ;
	fprintf(options.out,"# %i sequences, %i elements\n",nseq, el) ;
	DisplayMatrix(options.out,el,tablica) ;
	
	if(html) gp_warn_print_all(options.out) ;

	/* the table was allocated by GetMatrix() and is not needed any more */
	free(tablica) ;
	fclose(options.out) ;
	fclose(options.in) ;
	return(0);
}


/*
 * ScaleMatrix rescales the table t in place. The table holds nel rows of
 * four consecutive values each (columns 0..3).
 *
 * Up to three steps are applied, in this order, each one only when enabled:
 *   1. opt.freq    : every cell is divided by nseq;
 *   2. opt.gc > 0  : every cell is divided by the value expected for its
 *                    column: (100 - opt.gc)/200 for columns 0 and 3,
 *                    opt.gc/200 for columns 1 and 2;
 *   3. opt.log     : every cell is replaced by its natural logarithm; a
 *                    cell equal to zero is treated as 0.0001.
 *
 * No range check is made on opt.gc: at 100 the divisor for columns 0 and 3
 * is zero, above 100 it is negative.
 *
 * Always returns 0.
 */
int ScaleMatrix(double *t, int nseq, int nel, opt_s opt) {

	double expected[4] ;
	double *cell ;
	int row,col ;

	/* step 1: divide by the number of sequences */
	if(opt.freq) {
		cell = t ;
		for(row = 0;row<nel;row++)
			for(col = 0;col<4;col++,cell++)
				*cell = *cell/nseq ;
	}

	/* step 2: divide by the value expected at the requested GC content */
	if(opt.gc > 0) {
		expected[1] = expected[2] = opt.gc/200 ;
		expected[0] = expected[3] = (100 - opt.gc)/200 ;

		cell = t ;
		for(row = 0;row<nel;row++)
			for(col = 0;col<4;col++,cell++)
				*cell = *cell/expected[col] ;
	}

	/* step 3: logarithm, with 0.0001 standing in for empty cells */
	if(opt.log) {
		cell = t ;
		for(row = 0;row<nel;row++)
			for(col = 0;col<4;col++,cell++)
				*cell = log((*cell == 0) ? 0.0001 : *cell) ;
	}

	return(0) ;
}

/* 
 * GetMatrix reads sequences from `in` until gp_seq_read_fragment() returns
 * NULL and counts, for every position, how often each nucleotide occurs
 * there.
 *
 * The returned table holds four doubles per position, in the column order
 * A, C, G, T. U is counted in the T column, letter case is ignored and every
 * other character is skipped.
 *
 * On return nseq[0] holds the number of sequences read and nel[0] the length
 * of the shortest one, which is the number of meaningful rows in the table.
 * For each sequence only the positions below the current value of nel[0]
 * are counted.
 *
 * The table comes from calloc()/realloc() and is not freed here.
 *
 * gp_error() is called when no sequence was read or when memory cannot be
 * allocated; the code assumes that gp_error() does not return -- TODO
 * confirm against the genpak library.
 */

double* GetMatrix(FILE* in, int* nel, int* nseq) {

	int i,up,n = 0,max = 5 ;
	double *t,*grown ;
	int C[128] ;
	sekw *s ;
	nel[0] = max ;
	nseq[0] = 0 ;

	/* the typical jw3 sequence conversion: character code -> column.
	 * Every other entry is -1, so an entry that was never set can not be
	 * used as a column index. */
	for(i = 0;i<128;i++) C[i] = -1 ;
	C['A'] = 0 ; C['C'] = 1 ; C['G'] = 2 ; C['T'] = 3 ; C['U'] = 3 ;
	

	/* allocating some initial space */
	t = calloc(max*4,sizeof(double)) ;
	if(t == NULL)
		gp_error("GetMatrix: could not allocate memory for the matrix") ;

	/* NOTE(review): s is never released in this loop; whether the caller of
	 * gp_seq_read_fragment() has to free it cannot be told from this file */
	while((s = gp_seq_read_fragment(in,0,0,0)) != NULL) {
		n++ ;
		if(debug) gp_warn("GetMatrix: processing sequence %i",n) ;

		if(s->leng > nel[0]) {

			/* we have to make sure that there is enough memory to allocate the
			 * matrix. Only the first sequence can make the table grow: later
			 * sequences can only shorten the evaluated range. */
			if(n == 1) {
				if(debug) gp_warn("reallocating sequence, max length %i",s->leng) ;

				/* keep the old pointer until the new one is known to be valid */
				grown = realloc(t,sizeof(double)*(s->leng)*4) ;

				if(grown == NULL) 
					gp_error("GetMatrix: could not allocate memory for the first sequence");
				t = grown ;

				/* realloc() does not clear the added part */
				for(i = (4*nel[0]);i<(4*s->leng);i++) t[i] = 0 ;
				if(debug) gp_warn("sequence reallocated") ;
				nel[0] = s->leng ;
			}
			
		} else {
			/* the matrix is not longer than the shortest sequence evaluated */
			nel[0] = s->leng ;
		}

		for(i = 0;i<nel[0];i++) {
			/* toupper() requires a value representable as unsigned char */
			up = toupper((unsigned char) s->sequ[i]) ;

			/* the range check keeps the index inside C; the -1 entries
			 * reject everything that is not A, C, G, T or U, including a
			 * NUL byte */
			if(up >= 0 && up < 128 && C[up] >= 0)
				t[i*4+C[up]] += 1 ;
		}
	}

	if(n<1) {
		fclose(in) ;
		gp_error("No sequences found") ;
	}
	
	nseq[0] = n ;
	if(debug) gp_warn("GetMatrix: %i sequences read",n) ;
	if(debug) DisplayMatrix(stdout,nel[0],t) ;

	return(t) ;
}


/*
 * DisplayMatrix writes the table t to `out`: first a comment line naming the
 * columns A, C, G and T, then one line for each of the first `elements`
 * rows, holding the four values of that row separated by tabs and printed
 * with four decimals.
 *
 * Always returns 0; the results of the fprintf() calls are not checked.
 */
int DisplayMatrix(FILE *out, int elements, double *t) {

	const double *row = t ;	/* first of the four values of the current row */
	int done ;

	fprintf(out,"# A	C	G	T	\n") ;

	for(done = 0;done<elements;done++) {
		fprintf(out,"%.4f\t%.4f\t%.4f\t%.4f\n",row[0],row[1],row[2],row[3]) ;
		row += 4 ;
	}

	return(0) ;
}

/* Standard message: print the usage summary to standard output and leave
 * the program with exit status 0, so this function does not return.
 * The option list mirrors the option string handed to gp_getopt() in main(). */

void Help() {
printf("\n");
printf("%s version %s - compute a nucleotide frequency matrix",PROGNAME,VERSION);
printf("\n");
printf("  Usage:\n");
printf("     %s [options] [ input file ] [ output file ]\n",progname);
printf("\n");
printf("  Options:\n");
printf("     -a       : Instead of frequencies, print just the numbers of occurrences\n");
printf("     -d       : print lots of debugging information\n");
printf("     -g value : divide each frequency by the expected frequency of the given\n");
printf("                nucleotide at the GC%% [value]\n");
printf("     -h       : print this help screen & exit\n");
printf("     -l       : do not apply a logarithmic scaling of the frequencies\n");
printf("     -q       : run in quiet mode (cannot be combined with -d)\n");
printf("     -v       : print version information & exit\n");
printf("     -H       : Use an output format suitable for embedding in HTML\n\n");
exit(0);
}


			
