Comparative Genomics

data/sormaize.png

Arabidopsis thaliana/lyrata

data/dotplot.png

Rtree

>>> from rtree import Rtree
>>> from biostuff import BlastLine
>>> r = Rtree('ex.rtree')
>>> for i, line in enumerate(open('data/tabd.blast')):
...     b = BlastLine(line)
...     smin, smax = sorted([b.sstart, b.sstop])
...     #     id, (xmin, ymin, xmax, ymax), object
...     r.add(i, (b.qstart, smin, b.qstop, smax), b)
>>> del r # flush for next block.

# somewhere else.
>>> from rtree import Rtree
>>> r = Rtree('ex.rtree')
>>> r.intersection((44384798, 9693297, 44384798, 9693297), objects="raw")
[BlastLine(chr6[44384798:44387498]-10[9693297:9696007], ptcid=93.540, eval=0.000)]

GenomeTools

  • CLI tools for GFF feature extraction and filtering, LTR retrotransposon prediction, suffix-array construction (suffixerator), etc.
  • C API with bindings for Lua, Python, Ruby.
data/genometools.png

pyfasta: Pythonic access to FASTA sequence files

>>> from pyfasta import Fasta
>>> f = Fasta('data/three_chrs.fasta')
>>> sorted(f.keys())
['chr1', 'chr2', 'chr3']

>>> f['chr1'][:5] # normal python slicing.
'ACTGA'

>>> f['chr1'][2::3] # third (last) base of every codon.
'TCAGTCAGTCAGTCAGTCAGTCAGTC'

>>> import numpy as np
>>> a = np.array(f['chr1'])
>>> a.dtype, a.shape
(dtype('|S1'), (80,))

>>> a[a == 'C'] = np.array('N', dtype='S1') # mask cytosines
>>> a[:10].tostring()
'ANTGANTGAN'