We examined the consistency between statistics derived from the NF and TF model forms. Specifically, we contrast the likelihood ratio tests which compare a baseline and dinucleotide parameter against the simpler baseline parameterisation, and the maximum likelihood estimators obtained under the alternate hypotheses. The substitution models were identically constructed, aside from the defining difference between the NF and TF forms (matrix weighting), and optimised with the identical procedure.
The module responsible to executing this procedure (fit_dinuc_individ.py) is shown below.
from __future__ import division
import os, optparse
from pprint import pprint
from cogent import LoadSeqs, LoadTree, DNA, LoadTable
from cogent.evolve.substitution_model import Nucleotide
from fit_util import make_pwise, _nucs, _dinucs, opt,\
motif_probs_from_monomers, get_ci, get_data_subpath, create_path
def iterate_specific_dinuc_preds(test_run=False):
"""returns the (name, pred) series
pred series is a list of the nuc_GTR preds and one dinucleotide pred which
can be used to directly specify a substitution model"""
nuc_baseline = make_pwise(_nucs)
nuc_baseline.pop(-1)
# return the nucleotide baseline first
yield 'baseline', nuc_baseline
dinuc_gtr_preds = make_pwise(_dinucs)
if test_run:
dinuc_gtr_preds = dinuc_gtr_preds[:2]
for dinuc in dinuc_gtr_preds:
yield str(dinuc), nuc_baseline + [dinuc]
def fit_model_series(aln, use_monomer_probs, with_CI=False, show_progress=True, max_evaluations = 1000000):
"""fit's a series of different predicate sets to the same alignment. The
BASE model (only nucleotide GTR predicates) is fit first. The MLEs from
the BASE model are used for each subsequent variant: BASE+dinuc_param.
Arguments:
- aln: sequence alignment
- use_monomer_probs: nucleotide freq form when True, Tuple freq form
when False.
- with_CI: whether to estimate parameter confidence intervals, default
is False.
- show_progress: optimiser progress
- max_evaluations: maximum function evaluations for optimiser
"""
assert len(aln.Names) == 3, "Designed for 3 OTUs only"
annot_tree = None # will be created from the baseline model
submod_args = dict(mprob_model='monomer', motif_length=2)
# we make an unrooted tree of the triple
tree = LoadTree(tip_names=aln.Names[:])
count = 0
tables = []
is_first_model=True
for i, (name, params) in enumerate(iterate_specific_dinuc_preds()):
count += 1
submod = Nucleotide(predicates=params, **submod_args)
tr = [annot_tree, tree][annot_tree is None]
lf = submod.makeLikelihoodFunction(tr)
lf.setAlignment(aln)
lf.setName(name)
print "\nWorking on [%d]: %s" % (count, name)
if not opt(lf, global_opt=is_first_model, show_progress=show_progress,
max_evaluations=max_evaluations):
print 'FAIL: Not optimised'
continue
mle_table = get_ci(lf, table_name=name,
with_motif_probs=is_first_model, with_CI=with_CI)
tables += [mle_table]
is_first_model = False
if annot_tree is None:
annot_tree = lf.getAnnotatedTree()
result = tables[0].appended('Model', tables[1:], title='')
return result
def run(align_dir, begin, end, NF, overwrite, with_CI, test_run):
"""the BASE and BASE+dinuc_param model series are iteratively fit to
individual alignments specified in the directory listing in the range
begin-end. Results are written per alignment to matching the
data/some_path/align_name as results/some_path/align_name.txt"""
model_name = ['TF', 'NF'][NF]
result_dir = os.path.join('../results', get_data_subpath(align_dir),
'dinuc_individual', model_name)
create_path(result_dir)
fns = [fn for fn in os.listdir(align_dir) if fn.endswith('.fasta')]
fns = fns[begin: end]
for index, fn in enumerate(fns):
print index
outfile_name = os.path.join(result_dir, fn.replace('.fasta', '.txt'))
if not overwrite and os.path.exists(outfile_name):
print outfile_name
if test_run:
print 'Already done'
continue
aln = LoadSeqs(os.path.join(align_dir, fn), moltype=DNA)
result = fit_model_series(aln, NF, with_CI, test_run)
if test_run:
print result
print outfile_name
break
result.writeToFile(outfile_name, sep="\t")
if __name__ == "__main__":
parser = optparse.OptionParser()
parser.add_option('--dirname', default=None)
parser.add_option("--align_nums",default=None,help="alignment file range")
parser.add_option('--NF', action='store_true', default=False,
help='use the nucleotide frequency weighted model, default is False')
parser.add_option("--overwrite",action='store_true',default=False,
help="overwrite existing results")
parser.add_option("--with_CI", action='store_true',default=False,
help="compute the confidence intervals")
parser.add_option('--test_run', action='store_true', default=False)
options, args = parser.parse_args()
assert None not in (options.dirname,)
b, e = map(int,options.align_nums.split('-'))
run(options.dirname, b, e, options.NF, options.overwrite, options.with_CI,
options.test_run)
print "\n\nDone!"
This module also depends on the utility module (fit_util.py), see previous section for contents.