AF comparison of Gnomad WGS vs WES¶
In [2]:
%%capture
# load the magic extension and imports
%reload_ext nextcode
import pandas as pd
import GOR_query_helper as GQH
import seaborn as sns
import matplotlib as plt
%env LOG_QUERY=1
project = "internal_hg19"
%env GOR_API_PROJECT={project}
In [3]:
qh = GQH.GOR_Query_Helper()
mydefs = ""
In [10]:
mydefs = qh.add_many("""
create #targets# = gor user_data/hakon/gdx_targest.gorz | segproj;
create #gnom_WGS# = pgor user_data/ref_gnomad4/WGS/genome_cols_hg19.gord | join -snpseg -i [#targets#]
| where pos > 0
| where upper(refbases(chrom,pos,pos+len(ref)-1))=ref
| select 1-4,af,an
| varnorm -left ref alt
| granno 1 -gc ref,alt -max -ic AN
| where AN > 0.75 * max_AN
| atmax 1 -gc ref,alt AF;
create #gnom_WES# = pgor user_data/hakon/gnomad_v4.gord | join -snpseg -i [#targets#]
| where pos > 0
| where upper(refbases(chrom,pos,pos+len(ref)-1))=ref
| select 1-4,af,an
| varnorm -left ref alt
| granno 1 -gc ref,alt -max -ic AN
| where AN > 0.75 * max_AN
| atmax 1 -gc ref,alt AF;
create #GDX# = gor ref_gdx/XA/AF/xa_af.gord -f ANY | where ac > 0 | join -snpseg -i [#targets#] | select 1-4,af,an;
create #stat# = pgor <(gor [#gnom_WGS#] | calc source 'gnomWGS'
| merge <(gor [#gnom_WES#] | calc source 'gnomWES')
| merge <(gor [#GDX#] | calc source 'gdx')
)
| group 1 -gc ref,alt -set -sc source -avg -fc af
| calc f ceil(-log(avg_af+1e-8))
| group genome -gc f,set_source -count;
create #a# = nor [#stat#] | group -gc f,set_source -sum -ic allcount;
create #t# = nor [#a#] | calc type 'gdx,gnomWES,gnomWGS' | split type
| where contains(set_source,type) | group -gc f,type -sum -ic sum_allcount
| pivot -v 'gdx','gnomWES','gnomWGS' type -gc f -e 0
| rename (.*)_sum_sum_allcount #{1};
create #o# = nor [#a#] | replace set_source replace(#rc,',','_')
| pivot set_source -v 'gdx','gdx_gnomWES','gdx_gnomWES_gnomWGS','gdx_gnomWGS','gnomWES','gnomWES_gnomWGS','gnomWGS' -gc f -e 0
| rename (.*)_sum_allcount o_#{1};
create #gnom_wgs_wes# = pgor <(gor [#gnom_WGS#]
| merge <(gor [#gnom_WES#] )
| group 1 -gc #3,#4
| varjoin -l -r [#gnom_WGS#] -e 0
| varjoin -l -r [#gnom_WES#] -e 0
| calc f_gnom_wgs round(-log(af+1e-8)*10)/10
| calc f_gnom_wes round(-log(afx+1e-8)*10)/10
| group chrom -gc f_gnom_wgs,f_gnom_wes -count
)
;
create #gnom_wgs_gdx# = pgor <(gor [#gnom_WGS#]
| merge <(gor [#GDX#] )
| group 1 -gc #3,#4
| varjoin -l -r [#gnom_WGS#] -e 0
| varjoin -l -r [#GDX#] -e 0
| calc f_gnom_wgs round(-log(af+1e-8)*10)/10
| calc f_gdx round(-log(afx+1e-8)*10)/10
| group chrom -gc f_gnom_wgs,f_gdx -count
)
;
create #gnom_wes_gdx# = pgor <(gor [#gnom_WES#]
| merge <(gor [#GDX#] )
| group 1 -gc #3,#4
| varjoin -l -r [#gnom_WES#] -e 0
| varjoin -l -r [#GDX#] -e 0
| calc f_gnom_wes round(-log(af+1e-8)*10)/10
| calc f_gdx round(-log(afx+1e-8)*10)/10
| group chrom -gc f_gnom_wes,f_gdx -count
)
;
""")
In [97]:
%%gor df_wgs_wes <<
$mydefs
gor [#gnom_wgs_wes#]
| group genome -gc f_gnom_wgs,f_gnom_wes -sum -ic allcount
| rename sum_allcount var_count
| replace var_count log(var_count+1)
Query ran in 0.36 sec Query fetched 2,657 rows in 0.06 sec (total time 0.42 sec)
In [98]:
%%gor df_wgs_gdx <<
$mydefs
gor [#gnom_wgs_gdx#]
| group genome -gc f_gnom_wgs,f_gdx -sum -ic allcount
| rename sum_allcount var_count
| replace var_count log(var_count+1)
Query ran in 0.18 sec Query fetched 2,849 rows in 0.06 sec (total time 0.24 sec)
In [99]:
%%gor df_wes_gdx <<
$mydefs
gor [#gnom_wes_gdx#]
| group genome -gc f_gnom_wes,f_gdx -sum -ic allcount
| rename sum_allcount var_count
| replace var_count log(var_count+1)
Query ran in 0.19 sec Query fetched 3,330 rows in 0.07 sec (total time 0.26 sec)
In [105]:
af = df_wgs_wes.pivot(index='f_gnom_wgs',columns='f_gnom_wes', values = 'var_count')
ax = sns.heatmap(af)
ax.set_title('GNOMAD-WGS-WES')
Out[105]:
Text(0.5, 1.0, 'GNOMAD-WGS-WES')
In [106]:
af = df_wgs_gdx.pivot(index='f_gnom_wgs',columns='f_gdx', values = 'var_count')
ax = sns.heatmap(af)
ax.set_title('GNOMAD-WGS-GDX')
Out[106]:
Text(0.5, 1.0, 'GNOMAD-WGS-GDX')
In [107]:
af = df_wes_gdx.pivot(index='f_gnom_wes',columns='f_gdx', values = 'var_count')
ax = sns.heatmap(af)
ax.set_title('GNOMAD-WES-GDX')
Out[107]:
Text(0.5, 1.0, 'GNOMAD-WES-GDX')
In [11]:
%%gor
$mydefs
nor [#t#] | map -c f [#o#]
| replace 1- reverse(fsvmap(reverse(#rc),1,"if(mod(i,3)=0 and i<len(#rc),x+',',x)",''))
| sort -c f
Query ran in 2.13 sec Query fetched 9 rows in 11.01 sec (total time 13.14 sec)
Out[11]:
f | gdx | gnomWES | gnomWGS | o_gdx | o_gdx_gnomWES | o_gdx_gnomWES_gnomWGS | o_gdx_gnomWGS | o_gnomWES | o_gnomWES_gnomWGS | o_gnomWGS | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 6 | 4 | 0 | 0 | 0 | 0 | 2 | 4 | 0 |
1 | 1 | 4,842 | 31,665 | 31,817 | 890 | 75 | 3,865 | 12 | 44 | 27,681 | 259 |
2 | 2 | 41,402 | 42,871 | 43,139 | 1,831 | 167 | 39,023 | 381 | 29 | 3,652 | 83 |
3 | 3 | 119,158 | 117,986 | 118,220 | 6,752 | 486 | 111,032 | 888 | 433 | 6,035 | 265 |
4 | 4 | 393,622 | 386,164 | 381,787 | 30,158 | 3,094 | 356,993 | 3,377 | 5,621 | 20,456 | 961 |
5 | 5 | 2,734,239 | 2,697,405 | 2,345,089 | 283,426 | 364,221 | 2,027,874 | 58,718 | 128,859 | 176,451 | 82,046 |
6 | 6 | 9,796,892 | 14,108,649 | 3,689,419 | 4,039,848 | 3,882,480 | 1,567,490 | 307,074 | 7,690,109 | 968,570 | 846,285 |
7 | 7 | 5,397 | 2,861,654 | 33,294 | 0 | 0 | 5,397 | 0 | 2,828,360 | 27,897 | 0 |
8 | 8 | 0 | 4,147,359 | 144,333 | 0 | 0 | 0 | 0 | 4,104,309 | 43,050 | 101,283 |
In [ ]:
%%gor df <<
$mydefs
create #full# = pgor <(gor [#gnom_WGS#]
| merge <(gor [#GDX#] )
| group 1 -gc #3,#4
| varjoin -r [#gnom_WGS#] -e 0
| varjoin -r [#GDX#] -e 0
| rename af af_wgs
| rename afx af_gdx
)
;
gor [#full#]
Query ran in 1.06 sec Working for 0:00:28...
In [ ]:
df.corr('af_wgs','af_gdx')