lh3 · vsbuffalo · May 11, 2013 · May 11, 2013
diff --git a/README.bio b/README.bio
@@ -2,39 +2,42 @@ Bioawk is an extension to Brian Kernighan's awk acquired from [1], adding the
 support of several common biological data formats, including optionally gzip'ed
 BED, GFF, SAM, VCF, FASTA/Q and TAB-delimited formats with the column names. It
 also adds a few built-in functions including, as of now, and(), or(), xor(),
-reverse() and revcomp(). The following are a few examples demonstrating the new
-functionality:
+reverse(), revcomp(), and fastx(). The following are a few examples demonstrating 
+the new functionality:
 
 1. Extract unmapped reads without header:
 
-   awk -c sam 'and($flag,4)' aln.sam.gz
+   awk -c sam 'and($flag, 4)' aln.sam.gz
 
 2. Extract mapped reads with header:
 
-   awk -c sam -H '!and($flag,4)'
+   awk -c sam -H '!and($flag, 4)'
 
 3. Reverse complement FASTA:
 
-   awk -c fastx '{print ">"$name;print revcomp($seq)}' seq.fa.gz
+   awk -c fastx '{print fastx($name, revcomp($seq))}' seq.fa.gz
 
 4. Create FASTA from SAM (uses revcomp if FLAG & 16)
 
    samtools view aln.bam | \
-      awk -c sam '{s=$seq; if(and($flag, 16)) {s=revcomp($seq)} print ">"$qname"\n"s}'
+      awk -c sam '{s=$seq; if(and($flag, 16)) {s=revcomp($seq)} print fastx($qname, s)}'
 
-5. Get the %GC from FASTA:
+5. Get the %GC table from FASTA:
 
-   awk -c fastx '{print ">"$name;print gc($seq)}' seq.fa.gz
+   awk -c fastx '{print $name, gc($seq)}' seq.fa.gz
 
-6. Get the mean Phred quality score from FASTQ:
+6. Get the mean Phred quality score table from FASTQ:
 
-   awk -c fastx '{print ">"$name;print meanqual($qual)}' seq.fq.gz
+   awk -c fastx '{print $name, meanqual($qual)}' seq.fq.gz
 
 7. Take column name from the first line (where "age" appears in the first line
    of input.txt):
 
    awk -c header '{print $age}' input.txt
 
+8. Use awk's redirection to split a FASTA file by sequence lengths.
+
+   awk -c fastx '{if (length($seq) < 35) {print fastx($name, $seq) > "short.fasta"} else {print fastx($name, $seq) > "long.fasta"}}' seq.fq.gz
 
 Note that when "-c" is not specified and the new built-in functions are not
 used, bioawk should behave exactly the same as the original BWK awk. At least

diff --git a/addon.c b/addon.c
@@ -230,6 +230,34 @@ Cell *bio_func(int f, Cell *x, Node **a)
 				if (buf[i] - 33 >= thres) ++cnt;
 			setfval(y, (Awkfloat)cnt);
 		}
+	} else if (f == BIO_FFASTX) {
+		if (a[1]->nnext == 0) {
+			FATAL("fastx requires at least two arguments");
+		} else {
+			char *buf, *name, *seq, *qual;
+			int bufsz=3*recsize;
+			int has_qual;
+			z = execute(a[1]->nnext);
+			if ((has_qual = a[1]->nnext->nnext != 0)) {
+				y = execute(a[1]->nnext->nnext);
+				qual = getsval(y);
+			}
+
+			if ((buf = (char *) malloc(bufsz)) == NULL)
+				FATAL("out of memory in fastx");
+
+			name = getsval(x);
+			seq = getsval(z);
+			if (!has_qual) {
+				sprintf(buf, ">%s\n%s", name, seq);
+			} else {
+				if (strlen(seq) != strlen(qual))
+					WARNING("fastx arguments seq and qual are not same length");
+				sprintf(buf, "@%s\n%s\n+\n%s", name, seq, qual); 
+			}
+			setsval(y, buf);
+			free(buf);
+		}
 	} /* else: never happens */
 	return y;
 }

diff --git a/addon.h b/addon.h
@@ -41,6 +41,7 @@ int bio_getrec(char **pbuf, int *psize, int isrecord);
 #define BIO_FMEANQUAL 204
 #define BIO_FQUALCOUNT 205
 #define BIO_FTRIMQ    206
+#define BIO_FFASTX    207
 
 
 struct Cell;

diff --git a/lex.c b/lex.c
@@ -58,11 +58,12 @@ Keyword keywords[] ={	/* keep sorted: binary searched */
 	{ "else",	ELSE,		ELSE },
 	{ "exit",	EXIT,		EXIT },
 	{ "exp",	FEXP,		BLTIN },
+	{ "fastx",	BIO_FFASTX,	BLTIN },
 	{ "fflush",	FFLUSH,		BLTIN },
 	{ "for",	FOR,		FOR },
 	{ "func",	FUNC,		FUNC },
 	{ "function",	FUNC,		FUNC },
-	{ "gc",	BIO_FGC,		BLTIN },
+	{ "gc",		BIO_FGC,       	BLTIN },
 	{ "getline",	GETLINE,	GETLINE },
 	{ "gsub",	GSUB,		GSUB },
 	{ "if",		IF,		IF },
@@ -72,7 +73,7 @@ Keyword keywords[] ={	/* keep sorted: binary searched */
 	{ "length",	FLENGTH,	BLTIN },
 	{ "log",	FLOG,		BLTIN },
 	{ "match",	MATCHFCN,	MATCHFCN },
-	{ "meanqual",	BIO_FMEANQUAL,		BLTIN },
+	{ "meanqual",	BIO_FMEANQUAL,	BLTIN },
 	{ "next",	NEXT,		NEXT },
 	{ "nextfile",	NEXTFILE,	NEXTFILE },
 	{ "or",		BIO_FOR,	BLTIN },
@@ -81,8 +82,8 @@ Keyword keywords[] ={	/* keep sorted: binary searched */
 	{ "qualcount",	BIO_FQUALCOUNT,	BLTIN },
 	{ "rand",	FRAND,		BLTIN },
 	{ "return",	RETURN,		RETURN },
-	{ "revcomp",BIO_FREVCOMP, BLTIN },
-	{ "reverse",BIO_FREVERSE, BLTIN },
+	{ "revcomp",	BIO_FREVCOMP,	BLTIN },
+	{ "reverse",	BIO_FREVERSE,	BLTIN },
 	{ "sin",	FSIN,		BLTIN },
 	{ "split",	SPLIT,		SPLIT },
 	{ "sprintf",	SPRINTF,	SPRINTF },