SMN1 calling from BAM files¶
In [31]:
%%capture
# load the magic extension and imports
%reload_ext nextcode
import pandas as pd
import GOR_query_helper as GQH
%env LOG_QUERY=1
project = "gdx-demo"
%env GOR_API_PROJECT={project}
SMA caller¶
This notebook shows how the code in pipa/sma.py can be implemented in GOR to implement a caller that takes BAM/CRAM input and returns the SMA status.
In [32]:
qh = GQH.GOR_Query_Helper()
mydefs = ""
In [33]:
mydefs = qh.add_many("""
def #SMN1_POS# = '70247773';
def #SMN2_POS# = '69372353';
def #bamfile# = source/samples/2796000-2796999/2796799/DEMO_2796799.bam;
""")
We can fetch the reads needed from a BAM file in several ways. First we do it with two range seek and then with a join of two sites.
In [34]:
%%gor
$mydefs
gor -p chr5:70245773-70249773 #bamfile#
| merge <(gor -p chr5:69370353-69374353 #bamfile#)
Query ran in 0.90 sec Query fetched 475 rows in 0.32 sec (total time 1.22 sec)
Out[34]:
Chromo | Pos | End | QName | Flag | MapQ | Cigar | MD | MRNM | MPOS | ISIZE | SEQ | QUAL | TAG_VALUES | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr5 | 69370484 | 69370633 | LH00291:112:22FNNFLT3:3:2164:49219:7297 | 163 | 60 | 150M | NaN | chr5 | 69370578 | 245 | TAAGGTATAAGCGGGCTCAGGAACATCATTGGACATACTGAAAGAA... | =;<?4>???@DD@BBDCB2DBAABB<CCAAECBCCAACDEBBBEBB... | XA=chr5,+70245909,150M,3; MC=151M BD=LLMNMMNLM... |
1 | chr5 | 69370490 | 69370639 | LH00291:112:22FNNFLT3:6:2186:32418:3243 | 163 | 60 | 150M | NaN | chr5 | 69370588 | 249 | ATAAGCGGGCTCAGGAACATCATTGGACATACTGAAAGAAGAAAAA... | =;;<AB?AACCBBDBAABB@BB@@DCBCCAACDEBBBEBBEBBBBB... | XA=chr5,+70245915,150M,2; MC=151M BD=LLNMLNLLD... |
2 | chr5 | 69370578 | 69370728 | LH00291:112:22FNNFLT3:3:2164:49219:7297 | 83 | 60 | 151M | NaN | chr5 | 69370484 | -245 | GGGAGGCCAAGGCAGGCGAATCACCTGAAGTCGGGAGTTCCAGATC... | >=*??*BD,DCEEDCEACAABEACECDBECCADDDEDCCDFDCBCF... | XA=chr5,-70246003,151M,2; MC=150M BD=JMMNOOONN... |
3 | chr5 | 69370588 | 69370738 | LH00291:112:22FNNFLT3:6:2186:32418:3243 | 83 | 60 | 151M | NaN | chr5 | 69370490 | -249 | GGCAGGCGAATCACCTGAAGTCGGGAGTTCCAGATCAGCCTGACCA... | >????C@B@ABEACECCADCBACCCDDCCCFDDBCFEFDFDCADFB... | XA=chr5,-70246013,151M,3; MC=150M BD=OOONOOOML... |
4 | chr5 | 69371122 | 69371271 | LH00291:112:22FNNFLT3:4:1278:22340:17182 | 99 | 0 | 150M | NaN | chr5 | 69371167 | 196 | GCTCTGCCTCCCAGGTTCACACCATTCTCTTGCCTCAGCCTCCCGA... | >@?>ACDBCBCCCECA@CCCBBBCAACDDDAFFDEDDFFCE6DDAB... | XA=chr5,+70246547,150M,0; MC=151M BD=LLOMNPPPN... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
470 | chr5 | 70248294 | 70248443 | LH00291:112:22FNNFLT3:7:1194:11624:22454 | 83 | 0 | 150M | NaN | chr5 | 70248112 | -332 | GACACCACTAAAGAAACGATCAGACAGATCTGGAATGTGAAGCGTT... | =;@=AD@DAAADCAAAACABEDCAEEDBBFCDDBBDDDDBDEBDCA... | MC=151M BD=OJKNOLPPNENNMEMNNOONOMOJOMOONOMMLKM... |
471 | chr5 | 70248387 | 70248537 | LH00291:112:22FNNFLT3:3:1129:4208:11495 | 81 | 8 | 151M | NaN | chr5 | 70247927 | -611 | AGAAAAAAGGAAGTGGAATGGGTAACTCTTCTTGATTAAAAGTTAT... | ?*;,=?@CBCADCCCC,ACC)C+AAEC1CBFB+DBCBBBBE+BBBD... | XA=chr5,-69372967,151M,1; MC=150M BD=MLDDDENON... |
472 | chr5 | 70249023 | 70249128 | LH00291:112:22FNNFLT3:3:1179:29543:4733 | 163 | 49 | 106M | NaN | chr5 | 70249023 | 106 | GCCCACGCTGGAGTGCAGTGGTGCAATCTCAGCTCACTGCAACCTC... | 0?==?@?CBCBAD@DDBD@D5@DDBBACDCCEEDCCCDEECBCCDC... | XA=chr5,+69373603,106M,2; MC=106M BD=LLOJKILMN... |
473 | chr5 | 70249023 | 70249128 | LH00291:112:22FNNFLT3:3:1179:29543:4733 | 83 | 49 | 106M | NaN | chr5 | 70249023 | -106 | GCCCACGCTGGAGTGCAGTGGTGCAATCTCAGCTCACTGCAACCTC... | FCDFBBFF,DDECDF6E+DDDDF6BB/FCFEF2/FB2DF6BADFC3... | XA=chr5,-69373603,106M,2; MC=106M BD=NENKNMOOM... |
474 | chr5 | 70249703 | 70249853 | LH00291:112:22FNNFLT3:2:1202:23329:25017 | 99 | 0 | 151M | NaN | chr5 | 70249794 | 242 | ACAGAAAACATCAAAGCCACCACACTACCTGGCTGGATTTATCCTA... | >>>@?@AABBACCBBEDCCCBBBCCDACDDEDFEFDCBBABBDDDA... | XA=chr5,+69374283,151M,0; MC=151M BD=LLJOOMEEL... |
475 rows × 14 columns
and now with the join method that is more easily extended to multi-site calling:
In [35]:
%%gor
$mydefs
create #sma_pos# = norrows 1 | calc chrom 'chr5' | calc pos '69372353,70247773' | split pos | select chrom,pos;
gor [#sma_pos#] | join -snpsnp -ir -f 2000 #bamfile#
Query ran in 0.18 sec Query fetched 475 rows in 0.74 sec (total time 0.93 sec)
Out[35]:
Chromo | Pos | End | QName | Flag | MapQ | Cigar | MD | MRNM | MPOS | ISIZE | SEQ | QUAL | TAG_VALUES | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr5 | 69370484 | 69370633 | LH00291:112:22FNNFLT3:3:2164:49219:7297 | 163 | 60 | 150M | NaN | chr5 | 69370578 | 245 | TAAGGTATAAGCGGGCTCAGGAACATCATTGGACATACTGAAAGAA... | =;<?4>???@DD@BBDCB2DBAABB<CCAAECBCCAACDEBBBEBB... | XA=chr5,+70245909,150M,3; MC=151M BD=LLMNMMNLM... |
1 | chr5 | 69370490 | 69370639 | LH00291:112:22FNNFLT3:6:2186:32418:3243 | 163 | 60 | 150M | NaN | chr5 | 69370588 | 249 | ATAAGCGGGCTCAGGAACATCATTGGACATACTGAAAGAAGAAAAA... | =;;<AB?AACCBBDBAABB@BB@@DCBCCAACDEBBBEBBEBBBBB... | XA=chr5,+70245915,150M,2; MC=151M BD=LLNMLNLLD... |
2 | chr5 | 69370578 | 69370728 | LH00291:112:22FNNFLT3:3:2164:49219:7297 | 83 | 60 | 151M | NaN | chr5 | 69370484 | -245 | GGGAGGCCAAGGCAGGCGAATCACCTGAAGTCGGGAGTTCCAGATC... | >=*??*BD,DCEEDCEACAABEACECDBECCADDDEDCCDFDCBCF... | XA=chr5,-70246003,151M,2; MC=150M BD=JMMNOOONN... |
3 | chr5 | 69370588 | 69370738 | LH00291:112:22FNNFLT3:6:2186:32418:3243 | 83 | 60 | 151M | NaN | chr5 | 69370490 | -249 | GGCAGGCGAATCACCTGAAGTCGGGAGTTCCAGATCAGCCTGACCA... | >????C@B@ABEACECCADCBACCCDDCCCFDDBCFEFDFDCADFB... | XA=chr5,-70246013,151M,3; MC=150M BD=OOONOOOML... |
4 | chr5 | 69371122 | 69371271 | LH00291:112:22FNNFLT3:4:1278:22340:17182 | 99 | 0 | 150M | NaN | chr5 | 69371167 | 196 | GCTCTGCCTCCCAGGTTCACACCATTCTCTTGCCTCAGCCTCCCGA... | >@?>ACDBCBCCCECA@CCCBBBCAACDDDAFFDEDDFFCE6DDAB... | XA=chr5,+70246547,150M,0; MC=151M BD=LLOMNPPPN... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
470 | chr5 | 70248294 | 70248443 | LH00291:112:22FNNFLT3:7:1194:11624:22454 | 83 | 0 | 150M | NaN | chr5 | 70248112 | -332 | GACACCACTAAAGAAACGATCAGACAGATCTGGAATGTGAAGCGTT... | =;@=AD@DAAADCAAAACABEDCAEEDBBFCDDBBDDDDBDEBDCA... | MC=151M BD=OJKNOLPPNENNMEMNNOONOMOJOMOONOMMLKM... |
471 | chr5 | 70248387 | 70248537 | LH00291:112:22FNNFLT3:3:1129:4208:11495 | 81 | 8 | 151M | NaN | chr5 | 70247927 | -611 | AGAAAAAAGGAAGTGGAATGGGTAACTCTTCTTGATTAAAAGTTAT... | ?*;,=?@CBCADCCCC,ACC)C+AAEC1CBFB+DBCBBBBE+BBBD... | XA=chr5,-69372967,151M,1; MC=150M BD=MLDDDENON... |
472 | chr5 | 70249023 | 70249128 | LH00291:112:22FNNFLT3:3:1179:29543:4733 | 163 | 49 | 106M | NaN | chr5 | 70249023 | 106 | GCCCACGCTGGAGTGCAGTGGTGCAATCTCAGCTCACTGCAACCTC... | 0?==?@?CBCBAD@DDBD@D5@DDBBACDCCEEDCCCDEECBCCDC... | XA=chr5,+69373603,106M,2; MC=106M BD=LLOJKILMN... |
473 | chr5 | 70249023 | 70249128 | LH00291:112:22FNNFLT3:3:1179:29543:4733 | 83 | 49 | 106M | NaN | chr5 | 70249023 | -106 | GCCCACGCTGGAGTGCAGTGGTGCAATCTCAGCTCACTGCAACCTC... | FCDFBBFF,DDECDF6E+DDDDF6BB/FCFEF2/FB2DF6BADFC3... | XA=chr5,-69373603,106M,2; MC=106M BD=NENKNMOOM... |
474 | chr5 | 70249703 | 70249853 | LH00291:112:22FNNFLT3:2:1202:23329:25017 | 99 | 0 | 151M | NaN | chr5 | 70249794 | 242 | ACAGAAAACATCAAAGCCACCACACTACCTGGCTGGATTTATCCTA... | >>>@?@AABBACCBBEDCCCBBBCCDACDDEDFEFDCBBABBDDDA... | XA=chr5,+69374283,151M,0; MC=151M BD=LLJOOMEEL... |
475 rows × 14 columns
Now we can use PILEUP to call the SNPs of interest:
In [36]:
%%gor
$mydefs
create #sma_pos# = norrows 1 | calc chrom 'chr5' | calc pos '69372353,70247773' | split pos | select chrom,pos;
gor [#sma_pos#] | join -snpsnp -ir -f 2000 #bamfile#
| pileup -gt | where SNP = 1
Query ran in 0.14 sec Query fetched 4 rows in 0.80 sec (total time 0.93 sec)
Out[36]:
Chrom | Pos | RefBase | MajorAllele | SecondAllele | Chi | Depth | Adepth | Cdepth | Gdepth | Tdepth | Dels | Ins | GT | pGT | LOD | GT2 | SNP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr5 | 69373667 | a | G | G | 0.0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | G | 0.3719 | 0.3 | R | 1 |
1 | chr5 | 70245876 | t | C | C | 0.0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | C | 0.3774 | 0.3 | M | 1 |
2 | chr5 | 70247901 | T | T | G | 4.7 | 40 | 0 | 0 | 11 | 29 | 0 | 0 | K | 1.0000 | 27.8 | T | 1 |
3 | chr5 | 70249068 | g | C | C | 0.0 | 2 | 0 | 2 | 0 | 0 | 0 | 0 | C | 0.5699 | 0.6 | M | 1 |
To get only our locations of interest we simply intersect the pileup output:
In [38]:
%%gor
$mydefs
create #sma_pos# = norrows 1 | calc chrom 'chr5' | calc pos '69372353,70247773' | split pos | select chrom,pos;
gor [#sma_pos#] | join -snpsnp -ir -f 2000 #bamfile#
| pileup -gt | join -snpsnp [#sma_pos#]
Query ran in 0.21 sec Query fetched 2 rows in 0.87 sec (total time 1.09 sec)
Out[38]:
Chrom | Pos | RefBase | MajorAllele | SecondAllele | Chi | Depth | Adepth | Cdepth | Gdepth | Tdepth | Dels | Ins | GT | pGT | LOD | GT2 | SNP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr5 | 69372353 | T | T | T | 0.0 | 9 | 0 | 0 | 0 | 9 | 0 | 0 | T | 1.0 | 5.6 | W | 0 |
1 | chr5 | 70247773 | C | C | C | 0.0 | 44 | 0 | 44 | 0 | 0 | 0 | 0 | C | 1.0 | 16.2 | M | 0 |
and out final logic
In [47]:
mydefs = qh.add_many("""
def #base_quality_threshold# = 20;
def #smn1_count_threshold# = 10;
def #ratio_threshold# = 0.05;
def #total_count_threshold# = 10;
create #sma_pos# = norrows 1 | calc chrom 'chr5' | calc pos '69372353,70247773' | split pos | select chrom,pos;
create #pile# = gor [#sma_pos#] | join -snpsnp -ir -f 2000 #bamfile#
| pileup -gt -bq #base_quality_threshold#
| join -snpsnp [#sma_pos#];
create #output# = nor [#pile#] | calc type if(pos = 69372353, 'smn2','smn1')
| select type,depth-Tdepth
| pivot -v 'smn1','smn2' type
| calc c_count_total = smn1_Cdepth + smn2_Cdepth
| calc t_count_total = smn2_Tdepth + smn2_Tdepth
| calc output if(c_count_total + t_count_total < #total_count_threshold# ,'SMN1/2 counts lower than threshold',
if(t_count_total = 0,'division by zero',
if(smn1_Cdepth <= #smn1_count_threshold# and c_count_total / t_count_total <= #ratio_threshold#,'hom call: '+str(c_count_total),'no call: '+str(c_count_total/ t_count_total))))
| select output;
""")
In [48]:
%%gor
$mydefs
nor [#output#]
Query ran in 1.32 sec Query fetched 1 rows in 0.00 sec (total time 1.32 sec)
Out[48]:
output | |
---|---|
0 | no call: 2.75 |
Here is the full code and for comparison the sma.py code below.
In [49]:
print(mydefs)
def #SMN1_POS# = '70247773'; def #SMN2_POS# = '69372353'; def #bamfile# = source/samples/2796000-2796999/2796799/DEMO_2796799.bam; def #base_quality_threshold# = 20; def #smn1_count_threshold# = 10; def #ratio_threshold# = 0.05; def #total_count_threshold# = 10; create #sma_pos# = norrows 1 | calc chrom 'chr5' | calc pos '69372353,70247773' | split pos | select chrom,pos; create #pile# = gor [#sma_pos#] | join -snpsnp -ir -f 2000 #bamfile# | pileup -gt -bq #base_quality_threshold# | join -snpsnp [#sma_pos#]; create #output# = nor [#pile#] | calc type if(pos = 69372353, 'smn2','smn1') | select type,depth-Tdepth | pivot -v 'smn1','smn2' type | calc c_count_total = smn1_Cdepth + smn2_Cdepth | calc t_count_total = smn2_Tdepth + smn2_Tdepth | calc output if(c_count_total + t_count_total < #total_count_threshold# ,'SMN1/2 counts lower than threshold', if(t_count_total = 0,'division by zero', if(smn1_Cdepth <= #smn1_count_threshold# and c_count_total / t_count_total <= #ratio_threshold#,'hom call: '+str(c_count_total),'no call: '+str(c_count_total/ t_count_total)))) | select output;
In [ ]:
#!/mnt/fb1/ProdBin/pipa/pyVE/pipa/bin/python
# Python script to call presence of SMN1 gene.
# Can be tested by running:
# $ pip install -r requirements.txt
# $ python pipeline/sma.py call \
# --case FAM000001 \
# --reference /mnt/hnas/reference/hg19/hg19.fa \
# tests/data/alignments/1842535.smn.bam \
# tests/data/alignments/1842014.smn.bam
import fire
import logging
import pysam
import subprocess
import sys
import vcf
from itertools import chain
# create logger
logger = logging.getLogger('sma')
logger.setLevel(logging.DEBUG)
# create console handler
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter and add it to the handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# add the handler to the logger
logger.addHandler(ch)
# SMN1/2 differentiating positions
SMN1_POS = {'chrom': '5', 'pos': 70247773}
SMN2_POS = {'chrom': '5', 'pos': 69372353}
# SMN1 primary transcript
SMN1_TRANSCRIPT = 'NM_000344'
SMN1_EXON_7_LENGTH = 498
# call decision map
CALL_MAP = {
True: 'hom',
False: 'wt',
None: 'nc',
}
def _run_mpileup(samtools, sample, alignment, reference, position, base_quality_threshold, verbosity):
output = f'{sample}.{position}.mpileup.vcf'
# run samtools mpileup from differentiating positions
cmd = [
samtools,
'mpileup',
'-v',
'-u',
'-q',
f'{base_quality_threshold}',
'-Q',
'-O',
'-s',
'-B',
'-I',
'-R',
'-t',
'AD',
'-t',
'DP',
'-t',
'SP',
'-f',
f'{reference}',
'-r',
f'{position}',
'-o',
f'{output}',
alignment,
]
# run cromwell command
if verbosity:
logger.info('Running samtools mpileup command:')
logger.info(cmd)
try:
subprocess.run(
cmd,
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
except subprocess.CalledProcessError as e:
log_err = f'{sample}.{position}.mpileup.err.log'
with open(log_err, 'w') as fh:
fh.write(str(e.stderr))
log_out = f'{sample}.{position}.mpileup.out.log'
with open(log_out, 'w') as fh:
fh.write(str(e.stdout))
logger.critical('Failed to run samtools mpileup. See logs:')
logger.critical(log_out)
logger.critical(log_err)
exit(e.returncode)
return output
def _get_counts(vcf_file, alt_base):
"""Given a VCF file name and alternate base,
returns the ref and alt counts
:param vcf_file: str of full path to vcf file name (required)
:param alt_base: str of alt base of interest, case sensitive (required)
"""
vcf_reader = vcf.Reader(filename=vcf_file)
try:
record = next(vcf_reader)
except StopIteration:
# assume we have no coverage if we got an empty vcf
return 0, 0, 0
try:
sample_call = list(record)[0]
except IndexError:
# assume we have no coverage if we got a vcf with no sample calls
return 0, 0, 0
total_count = sample_call.data.DP
ref_count = sample_call.data.AD[0]
alts = record.ALT
if alt_base in alts:
alt_pos = alts.index(alt_base) + 1
alt_count = sample_call.data.AD[alt_pos]
else:
alt_count = 0
return ref_count, alt_count, total_count
def _make_call(samtools, alignment, alignment_type, reference, smn1_count_threshold, base_quality_threshold, ratio_threshold, verbosity, total_count_threshold=10):
# open alignment file
try:
if alignment_type == 'bam':
sam_file = pysam.AlignmentFile(alignment, 'rb')
else:
sam_file = pysam.AlignmentFile(alignment, 'rc')
except (FileNotFoundError, PermissionError) as e:
logger.error('Cannot open alignment file. %s' % e)
return ['NA' for _ in range(5)]
try:
sample = sam_file.header['RG'][0]['SM']
except KeyError:
logger.error('Failed to get sample from alignment header.')
return ['NA' for _ in range(5)]
sam_file.close()
smn1_pileup_vcf = _run_mpileup(
samtools,
sample,
alignment,
reference,
f"chr{SMN1_POS['chrom']}:{SMN1_POS['pos']}-{SMN1_POS['pos']}",
base_quality_threshold,
verbosity,
)
smn2_pileup_vcf = _run_mpileup(
samtools,
sample,
alignment,
reference,
f"chr{SMN2_POS['chrom']}:{SMN2_POS['pos']}-{SMN2_POS['pos']}",
base_quality_threshold,
verbosity,
)
c_count_smn1, t_count_smn1, total_count_smn1 = _get_counts(smn1_pileup_vcf, 'T')
t_count_smn2, c_count_smn2, _ = _get_counts(smn2_pileup_vcf, 'C')
c_count_total = c_count_smn1 + c_count_smn2
t_count_total = t_count_smn1 + t_count_smn2
# make the homozygous call if we have a set count of SMN2 reads
# and the SMN1/SMN2 ratio is below the threshold
if c_count_total + t_count_total < total_count_threshold:
logger.info(f"{sample}: c_count_smn1:{c_count_smn1}, t_count_smn1:{t_count_smn1}")
logger.info(f"{sample}: c_count_smn2:{c_count_smn2}, t_count_smn2:{t_count_smn2}")
logger.info(f"{sample}: c_count_total:{c_count_total}, t_count_total:{t_count_total}")
logger.warning(f"{sample} has a total of SMN1/2 counts lower than threshold of {total_count_threshold}")
hom_call = None
else:
try:
hom_call = c_count_smn1 <= smn1_count_threshold and c_count_total / t_count_total <= ratio_threshold
except ZeroDivisionError:
# make a no_call for no-coverage smn2 samples
hom_call = None
# add call for this sample
return [
str(sample),
CALL_MAP[hom_call],
'99',
str(c_count_total),
'-1' if hom_call else '0',
]
def call(case, *alignments, alignments_list=None, alignment_type='bam', reference=None, smn1_count_threshold=10, ratio_threshold=0.05,
base_quality_threshold=20, split_calls=False, skip_wt=False, samtools='samtools', verbosity=0):
"""Calls total absence of a working copy of SMN1 gene linked to SMA.
:param case: Case identifier.
:param alignments: Alignment files.
:param alignments_list: Optional alignment list file. Ignored if positional alignments are provided.
:param alignment_type: alignment input file type, valid values are 'bam' or 'cram'. [default: 'bam']
:param reference: Reference fasta.
:param smn1_count_threshold: Minimum number of reads covering SMN2 differentiating position. [default: 10]
:param ratio_threshold: SMN1/SMN2 ratio cutoff. Assumes no SMN1 below this number. [default: 0.05]
:param base_quality_threshold: Read base quality threshold to use for samtools mpileup. [default: 20]
:param split_calls: If true, outputs one sample call per line. [default: false]
:param skip_wt: If true, doesn't output lines with all wild-type calls. [default: false]
:param samtools: Samtools executable path. [default: samtools]
:param verbosity: Verbose level. [default: 0]
"""
if reference is None:
logger.critical('Please provide reference fasta, see <--reference>')
exit(1)
if not alignments and alignments_list is None:
logger.critical(
'Please provide alignments files or a list of alignments, see <alignments> or <--alignments-list>'
)
exit(1)
elif not alignments:
try:
with open(alignments_list, 'r') as fh:
alignments = fh.read().splitlines()
except (FileNotFoundError, PermissionError) as e:
logger.critical('Cannot open alignments list file. %s' % e)
exit(1)
# output calls in GeneDx CNV format
# start headers
headers = [
'#case',
'genes',
'exons',
'cnv_type',
'transcript',
'chromosome',
'start',
'length',
]
# number of sample columns to output, use just one if asked to split calls
samples_number = 1 if split_calls else len(alignments)
# extend headers for how many sample calls we have
for i in range(1, samples_number + 1):
headers.extend(
['%s_%d' % (header, i) for header in [
"sample",
"genotype",
"score",
"avg_coverage",
"avg_deviance",
]]
)
# write headers
sys.stdout.write('\t'.join(headers))
sys.stdout.write('\n')
# fill common columns
common_columns = [
str(case),
'SMN1',
'ex7',
'CNV del',
SMN1_TRANSCRIPT,
SMN1_POS['chrom'],
str(SMN1_POS['pos']),
str(SMN1_EXON_7_LENGTH),
]
# if asked print one sample call per line
if split_calls:
for alignment in alignments:
c = _make_call(
samtools,
alignment,
alignment_type,
reference,
smn1_count_threshold,
base_quality_threshold,
ratio_threshold,
verbosity,
)
# skip calls where we couldn't even get the sample from input
if c[0] == 'NA':
continue
# if asked skip wild-type calls
if skip_wt and c[1] == 'wt':
continue
sys.stdout.write('\t'.join(common_columns + c))
sys.stdout.write('\n')
# else print all samples calls in one line
else:
calls = []
for alignment in alignments:
calls.append(
_make_call(
samtools,
alignment,
alignment_type,
reference,
smn1_count_threshold,
base_quality_threshold,
ratio_threshold,
verbosity,
)
)
# if asked to skip wild-type calls, only output call lines where at least on sample is not wt
if not skip_wt or any([c[1] != 'wt' for c in calls]):
sys.stdout.write('\t'.join(common_columns + list(chain(*calls))))
sys.stdout.write('\n')
def main():
fire.Fire({
'call': call,
})
if __name__ == '__main__':
main()
In [ ]: