Skip to content

Commit

Permalink
Close #375 by comparing consensus sequences where coverage score > 1.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Dec 9, 2016
1 parent 232b57a commit 6469ab3
Showing 1 changed file with 148 additions and 49 deletions.
197 changes: 148 additions & 49 deletions release_test_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@
from __future__ import print_function
from argparse import ArgumentParser
import csv
from difflib import Differ
from difflib import Differ, SequenceMatcher
from itertools import izip_longest
from glob import glob
from operator import itemgetter
import os


from micall.settings import rawdata_mount, pipeline_version, DONE_PROCESSING
from micall.settings import pipeline_version, DONE_PROCESSING


# set((sample, seed)) that had a coverage score better than 1 in source or target.
scored_samples = set()


def parse_args():
Expand All @@ -23,74 +28,155 @@ def parse_args():
return parser.parse_args()


def select_columns(csv_path,
column_names,
all_sample_names=None,
filter_sample_names=None):
""" Return lines from a CSV file with only some of the columns.
def select_rows(csv_path,
all_sample_names=None,
filter_sample_names=None):
""" Yield rows from a CSV file reader.
:param csv_path: path to the CSV file
:param column_names: columns to include
:param all_sample_names: a set to receive all values for the 'sample'
column, or None
:param filter_sample_names: a set of sample names to include in the
output, or None to not filter
"""
lines = []
with open(csv_path, 'rU') as f:
reader = csv.DictReader(f)
for row in reader:
if row.get('on.score', None) == '1':
continue
x4_pct = row.get('X4pct', None)
if x4_pct not in (None, ''):
row['X4pct'] = str(int(round(float(x4_pct))))
if all_sample_names is not None:
all_sample_names.add(row['sample'])
if filter_sample_names is None or row['sample'] in filter_sample_names:
lines.append(', '.join(itemgetter(*column_names)(row)) + '\n')
lines.sort()
return lines
yield row


def select_columns(rows, *column_names):
""" Yield lines from a CSV file with only some of the columns.
:param rows: rows from a CSV file reader
:param column_names: columns to include
"""
for row in rows:
score = row.get('on.score', None)
if score == '1':
continue
if score is not None:
scored_samples.add((row['sample'], row['seed']))
x4_pct = row.get('X4pct', None)
if x4_pct not in (None, ''):
row['X4pct'] = str(int(round(float(x4_pct))))
yield ', '.join(itemgetter(*column_names)(row)) + '\n'


def compare_coverage_scores(source_path, target_path):
columns = ['sample',
'seed',
'region',
'project',
'on.score']
filename = 'coverage_scores.csv'
def compare_files(source_path, target_path, filename, *columns):
sample_names = set()
target_columns = select_columns(os.path.join(target_path, filename),
columns,
all_sample_names=sample_names)
source_columns = select_columns(os.path.join(source_path, filename),
columns,
filter_sample_names=sample_names)
target_rows = select_rows(os.path.join(target_path, filename),
all_sample_names=sample_names)
target_columns = sorted(select_columns(target_rows, *columns))
source_rows = select_rows(os.path.join(source_path, filename),
filter_sample_names=sample_names)
source_columns = sorted(select_columns(source_rows, *columns))
compare_columns(source_columns, target_columns, target_path, filename)


def compare_columns(source_columns, target_columns, target_path, filename):
differ = Differ()
for i, diff in enumerate(differ.compare(source_columns, target_columns)):
if i == 0:
print('Coverage score changes in {}:'.format(target_path))
print('{} changes in {}:'.format(filename, target_path))
print(diff, end='')


def compare_g2p_scores(source_path, target_path):
columns = ['sample',
'X4pct',
'final']
filename = 'g2p_summary.csv'
def format_key(key):
return ', '.join(key)


def compare_consensus(source_path, target_path):
filename = 'conseq.csv'
sample_names = set()
target_columns = select_columns(os.path.join(target_path, filename),
columns,
all_sample_names=sample_names)
source_columns = select_columns(os.path.join(source_path, filename),
columns,
filter_sample_names=sample_names)
differ = Differ()
for i, diff in enumerate(differ.compare(source_columns, target_columns)):
if i == 0:
print('G2P changes in {}:'.format(target_path))
print(diff, end='')
keygetter = itemgetter('sample', 'region', 'consensus-percent-cutoff')
target_rows = sorted((row
for row in select_rows(os.path.join(target_path, filename),
all_sample_names=sample_names)
if (row['sample'], row['region']) in scored_samples),
key=keygetter)
source_rows = sorted((row
for row in select_rows(os.path.join(source_path, filename),
filter_sample_names=sample_names)
if (row['sample'], row['region']) in scored_samples),
key=keygetter)
target_keys = map(keygetter, target_rows)
source_keys = map(keygetter, source_rows)
for source_row, target_row in zip(source_rows, target_rows):
source_offset = int(source_row['offset'])
target_offset = int(target_row['offset'])
min_offset = min(source_offset, target_offset)
source_row['sequence'] = '-' * (source_offset-min_offset) + source_row['sequence']
target_row['sequence'] = '-' * (target_offset-min_offset) + target_row['sequence']
source_row['offset'] = target_row['offset'] = str(min_offset)
source_lines = list(select_columns(source_rows,
'sample',
'region',
'consensus-percent-cutoff',
'offset',
'sequence'))
target_lines = list(select_columns(target_rows,
'sample',
'region',
'consensus-percent-cutoff',
'offset',
'sequence'))
print('{} changes in {}:'.format(filename, target_path))
matcher = SequenceMatcher(a=source_keys, b=target_keys, autojunk=False)
for tag, alo, ahi, blo, bhi in matcher.get_opcodes():
if tag == 'replace':
for source_index, target_index in izip_longest(range(alo, ahi), range(blo, bhi)):
if source_index is None:
print('+ ' + format_key(target_keys[target_index]))
elif target_index is None:
print('- ' + format_key(source_keys[source_index]))
else:
source_key = source_keys[source_index]
target_key = target_keys[target_index]
if source_key != target_key:
source_line = format_key(source_key) + '\n'
target_line = format_key(target_key) + '\n'
else:
source_line = source_lines[source_index]
target_line = target_lines[target_index]
diff = line_diff(source_line, target_line)
print(''.join(diff), end='')
elif tag == 'delete':
for key in source_keys[alo:ahi]:
print('- ' + ', '.join(key))
elif tag == 'insert':
for key in target_keys[blo:bhi]:
print('+ ' + ', '.join(key))
else:
assert tag == 'equal', tag
for key in source_keys[alo:ahi]:
print(' ' + ', '.join(key) + '==')


def line_diff(source_line, target_line):
cruncher = SequenceMatcher(a=source_line, b=target_line, autojunk=False)
lines = ['- ', '? ', '+ ', '? ']
for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
adiff = bdiff = ' '
if tag == 'replace':
adiff = bdiff = '^'
elif tag == 'delete':
adiff = '-'
elif tag == 'insert':
bdiff = '+'
else:
assert tag == 'equal', tag
lines[0] += source_line[alo:ahi]
lines[2] += target_line[blo:bhi]
lines[1] += adiff * (ahi - alo)
lines[3] += bdiff * (bhi - blo)

lines[1] += '\n'
lines[3] += '\n'
return lines


def main():
Expand All @@ -114,8 +200,21 @@ def main():
source_versions = os.listdir(source_results_path)
source_versions.sort()
source_path = os.path.join(source_results_path, source_versions[-1])
compare_coverage_scores(source_path, target_path)
compare_g2p_scores(source_path, target_path)
compare_files(source_path,
target_path,
'coverage_scores.csv',
'sample',
'seed',
'region',
'project',
'on.score')
compare_files(source_path,
target_path,
'g2p_summary.csv',
'sample',
'X4pct',
'final')
compare_consensus(source_path, target_path)
print('Done: ' + run_name)
print('Done.')

Expand Down

0 comments on commit 6469ab3

Please sign in to comment.