Run Gregor studies in batch¶
In [1]:
%%capture
# load the magic extension and imports
%reload_ext nextcode
import pandas as pd
import GOR_query_helper as GQH
%env LOG_QUERY=1
project = "test-hg19"
%env GOR_API_PROJECT={project}
In [2]:
import nextcode
# svc_queryserver = nextcode.get_service("queryserver")
svc_query = nextcode.get_service("query")
In [3]:
alljobs = []
def initialize_jobs():
global alljobs
alljobs = []
def add_job(id):
alljobs.append(id)
def print_job_statuses():
for id in alljobs:
job = svc_query.get_query(id)
print(f"status of {id} is "+job.status)
def cancel_all_jobs():
for id in alljobs:
job = svc_query.get_query(id)
if job.status == 'RUNNING' or job.status == 'PENDING':
job.cancel()
In [4]:
%%gor manifest <<
nor SubjectReports/Participants.rep.link
| select pn,kind,affected,sex,study_name
| replace affected if(affected='yes','affected','unaffected')
| pivot -gc study_name kind -v index,father,mother -e ''
| hide index_affected,father_sex,mother_sex
Query ran in 0.68 sec Query fetched 692 rows in 0.07 sec (total time 0.75 sec)
In [50]:
manifest
Out[50]:
study_name | index_pn | index_sex | father_pn | father_affected | mother_pn | mother_affected | |
---|---|---|---|---|---|---|---|
0 | GCA825341 | TEST_2941719 | female | NaN | NaN | NaN | NaN |
1 | GCA802054 | TEST_2909078 | male | NaN | NaN | NaN | NaN |
2 | GCA863723 | TEST_2992440 | female | NaN | NaN | NaN | NaN |
3 | GCA820675 | TEST_2936504 | female | NaN | NaN | NaN | NaN |
4 | GCA833767 | TEST_2953629 | female | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... |
687 | GCA852666 | TEST_2977861 | male | NaN | NaN | NaN | NaN |
688 | GCA792495 | TEST_2901892 | male | TEST_2895471 | unaffected | TEST_2901923 | unaffected |
689 | GCA817697 | TEST_2925474 | female | NaN | NaN | TEST_2919688 | unaffected |
690 | GCA819745 | TEST_2931900 | male | TEST_2931924 | unaffected | TEST_2931915 | unaffected |
691 | GCA855762 | TEST_2983016 | female | TEST_2983018 | unaffected | TEST_2983017 | unaffected |
692 rows × 7 columns
In [2]:
initialize_jobs()
def NaN2E(x):
if str(x) == 'nan' or str(x) == 'NaN': return ''
else: return x
import datetime
print(datetime.datetime.now())
for i in range(0,len(manifest)):errrrrrrrorrrrrhere
#print(f"{manifest.at[i,'study_name']}")
gregor_queries = f"""def ##case_id## = {manifest.at[i,'study_name']};
def ##index_case## = '{manifest.at[i,'index_pn']}';
def ##father## = '{NaN2E(manifest.at[i,'father_pn'])}';
def ##father_affstat## = '{NaN2E(manifest.at[i,'father_affected'])}';
def ##mother## = '{NaN2E(manifest.at[i,'mother_pn'])}';
def ##mother_affstat## = '{NaN2E(manifest.at[i,'mother_affected'])}';
def ##index_gender## = '{manifest.at[i,'index_sex']}';
def ##cand_genes## = studies/##case_id##/candidate_genes.gor;
def ##gregor1_yaml## = user_data/hakon/YML/gregor_step1.ftl.yml;
def ##gregor2_yaml## = user_data/hakon/YML/gregor_step2.ftl.yml;
def ##gregor_mimi_yaml## = user_data/hakon/YML/gregor_mimi.ftl.yml;
def ##gregor_gene_yaml## = user_data/hakon/YML/gregor_gene_overview.ftl.yml;
def ##gor_or_pgor## = gor;
create #greg1# = gor ##gregor1_yaml##(index_case = ##index_case##, gender = ##index_gender##, mother= ##mother##, father = ##father##, gor_or_pgor = ##gor_or_pgor##)
| signature -file ##gregor1_yaml##;
create #greg2# = gor ##gregor2_yaml##(index_case = ##index_case##, mother= ##mother##, father = ##father##, prepopulated_result = [#greg1#], study_path = studies/##case_id##
, output_table = '##gregor2_result##')
| columnreorder chrom,pos,reference,call,Gene_symbol,gt_info,zygosity_proband*,zygosity_mother*,zygosity_father*,callratio_*,annotation,gmb_disease_info,gmb_disease_status,sequence_variant,VEP_Max_Consequence,internal_af,gnomad_af,homCount,homCount_info,GDX_classification,ext_classification,cnv_overlap,cnv_overlap_info,gene_hpocodes,gene_highrank,gene_cov_10x;
create #w1# = gor [#greg1#] | write s3data://shared/studies/##case_id##/tr/gregor_step1.gorz;
create #w2# = gor [#greg2#] | write studies/##case_id##/tr/gregor_step2.gorz;
gor [#greg2#] | top 1
"""
job = svc_query.execute(gregor_queries,job_type="lord",nowait="true")
id = job.id
print(f"""study {manifest.at[i,'study_name']}, no {i}, job {id}""")
print(gregor_queries)
add_job(id)
print(datetime.datetime.now())
Cell In[2], line 12 gregor_queries = f"""def ##case_id## = {manifest.at[i,'study_name']}; ^ IndentationError: unexpected indent
In [60]:
print_job_statuses()
status of 102880 is DONE status of 102881 is DONE status of 102882 is DONE status of 102883 is DONE status of 102884 is DONE status of 102885 is DONE status of 102886 is DONE status of 102887 is DONE status of 102888 is DONE status of 102889 is DONE status of 102890 is DONE status of 102891 is CANCELLED status of 102892 is CANCELLED status of 102893 is DONE status of 102894 is CANCELLED status of 102895 is DONE status of 102896 is CANCELLED status of 102897 is DONE status of 102898 is CANCELLED status of 102899 is CANCELLED status of 102900 is DONE status of 102901 is DONE status of 102902 is DONE status of 102903 is DONE status of 102904 is DONE status of 102905 is CANCELLED status of 102906 is DONE status of 102907 is DONE status of 102908 is DONE status of 102909 is DONE status of 102910 is DONE status of 102911 is DONE status of 102912 is CANCELLED status of 102913 is DONE status of 102914 is DONE status of 102915 is DONE status of 102916 is CANCELLED status of 102917 is FAILED status of 102918 is CANCELLED status of 102919 is DONE status of 102920 is CANCELLED status of 102921 is CANCELLED status of 102922 is CANCELLED status of 102923 is CANCELLED status of 102924 is CANCELLED status of 102925 is CANCELLED status of 102926 is CANCELLED status of 102927 is CANCELLED status of 102928 is CANCELLED status of 102929 is CANCELLED status of 102930 is DONE status of 102931 is CANCELLED status of 102932 is CANCELLED status of 102933 is DONE status of 102934 is CANCELLED status of 102935 is CANCELLED status of 102936 is DONE status of 102937 is CANCELLED status of 102938 is CANCELLED status of 102939 is DONE status of 102940 is CANCELLED status of 102941 is DONE status of 102942 is DONE status of 102943 is CANCELLED status of 102944 is DONE status of 102945 is DONE status of 102946 is CANCELLED status of 102947 is CANCELLED status of 102948 is CANCELLED status of 102949 is DONE status of 102950 is CANCELLED status of 102951 is DONE status of 102952 is DONE status of 102953 is DONE status of 102954 is DONE status of 102955 is CANCELLED status of 102956 is CANCELLED status of 102957 is CANCELLED status of 102958 is CANCELLED status of 102959 is DONE status of 102960 is DONE status of 102961 is CANCELLED status of 102962 is DONE status of 102963 is CANCELLED status of 102964 is DONE status of 102965 is CANCELLED status of 102966 is DONE status of 102967 is DONE status of 102968 is CANCELLED status of 102969 is CANCELLED status of 102970 is CANCELLED status of 102971 is CANCELLED status of 102972 is CANCELLED status of 102973 is DONE status of 102974 is DONE status of 102975 is CANCELLED status of 102976 is CANCELLED status of 102977 is DONE status of 102978 is CANCELLED status of 102979 is CANCELLED status of 102980 is CANCELLED status of 102981 is CANCELLED status of 102982 is CANCELLED status of 102983 is CANCELLED status of 102984 is DONE status of 102985 is CANCELLED status of 102986 is CANCELLED status of 102987 is CANCELLED status of 102988 is CANCELLED status of 102989 is CANCELLED status of 102990 is DONE status of 102991 is CANCELLED status of 102992 is CANCELLED status of 102993 is CANCELLED status of 102994 is CANCELLED status of 102995 is CANCELLED status of 102996 is CANCELLED status of 102997 is DONE status of 102998 is CANCELLED status of 102999 is CANCELLED status of 103000 is CANCELLED status of 103001 is DONE status of 103002 is CANCELLED status of 103003 is CANCELLED status of 103004 is CANCELLED status of 103005 is CANCELLED status of 103006 is DONE status of 103007 is CANCELLED status of 103008 is CANCELLED status of 103009 is DONE status of 103010 is CANCELLED status of 103011 is DONE status of 103012 is CANCELLED status of 103013 is CANCELLED status of 103014 is CANCELLED status of 103015 is CANCELLED status of 103016 is DONE status of 103017 is DONE status of 103018 is DONE status of 103019 is CANCELLED status of 103020 is DONE status of 103021 is CANCELLED status of 103022 is CANCELLED status of 103023 is CANCELLED status of 103024 is CANCELLED status of 103025 is CANCELLED status of 103026 is DONE status of 103027 is CANCELLED status of 103028 is CANCELLED status of 103029 is CANCELLED status of 103030 is CANCELLED status of 103031 is CANCELLED status of 103032 is CANCELLED status of 103033 is CANCELLED status of 103034 is DONE status of 103035 is DONE status of 103036 is CANCELLED status of 103037 is DONE status of 103038 is CANCELLED status of 103039 is CANCELLED status of 103040 is CANCELLED status of 103041 is DONE status of 103042 is CANCELLED status of 103043 is CANCELLED status of 103044 is CANCELLED status of 103045 is CANCELLED status of 103046 is DONE status of 103047 is CANCELLED status of 103048 is CANCELLED status of 103049 is CANCELLED status of 103050 is DONE status of 103051 is CANCELLED status of 103052 is DONE status of 103053 is CANCELLED status of 103054 is DONE status of 103055 is CANCELLED status of 103056 is CANCELLED status of 103057 is DONE status of 103058 is DONE status of 103059 is CANCELLED status of 103060 is CANCELLED status of 103061 is DONE status of 103062 is CANCELLED status of 103063 is DONE status of 103064 is CANCELLED status of 103065 is DONE status of 103066 is CANCELLED status of 103067 is CANCELLED status of 103068 is DONE status of 103069 is CANCELLED status of 103070 is CANCELLED status of 103071 is DONE status of 103072 is CANCELLED status of 103073 is CANCELLED status of 103074 is CANCELLED status of 103075 is CANCELLED status of 103076 is CANCELLED status of 103077 is CANCELLED status of 103078 is CANCELLED status of 103079 is CANCELLED status of 103080 is CANCELLED status of 103081 is CANCELLED status of 103082 is CANCELLED status of 103083 is DONE status of 103084 is CANCELLED status of 103085 is DONE status of 103086 is DONE
In [59]:
cancel_all_jobs()
In [ ]:
def ##gregor1_yaml## = user_data/hakon/YML/gregor_step1.ftl.yml;
def ##gregor2_yaml## = user_data/hakon/YML/gregor_step2.ftl.yml;
def ##gregor_cnv_yaml## = user_data/hakon/YML/gregor_cnvs_anno.ftl.yml;
def ##skip## = skip -2;
def ##tr## = tr3;
/* def #selcol# = select 1,2,Reference,Call,gene_symbol,GT_Info,zygosity_proband,zygosity_mother,zygosity_father,callratio_proband,callratio_father,callratio_mother,annotation,gmb_disease_info,gmb_disease_status,sequence_variant,VEP_Max_Consequence,internal_af,gnomad_af,homCount,homCount_info,GDX_classification,ext_classification,cnv_overlap,cnv_overlap_info,gene_hpocodes,gene_highrank,gene_cov_10x,PN,HGNC_id,vep_gene_symbol,Feature,ref_af,internal_ac,internal_homCount,xa_aff_af,xa_aff_ac,xa_aff_homCount,gnomad_homcount,gnomad_v4_1_AF,gnomad_v4_1_homcount,Consequence,VEP_Impact,Amino_acids,Protein_position,Protein_Size,Biotype,EXON,Refgene,qc_consequence_rank,CallRatio,CallCopies,Depth,FILTER,GL_Call,qc_in_exon,qc_protected_vars,qc_high_mod_impact,qc_in_roi,qc_known_vars,qc_closeby_known_vars,fidel_bp4_pp3,spliceai_max_consequence,spliceai_max_impact,Fidel_REVEL,Fidel_PRECPAT,Fidel_PRECBEN,Fidel_CALIBRATED,Fidel_ALPHA,Provean,GT_IHE,GT_Paternity,ratio_breakdown,father_call,father_CallCopies,father_CallRatio,father_Depth,father_ratio_breakdown,father_ApprDepth,father_GT,father_GTx,mother_call,mother_CallCopies,mother_CallRatio,mother_Depth,mother_ratio_breakdown,mother_ApprDepth,mother_GT,mother_GTx,Gene_cov,gene_cov_15x,Gene_avg_depth,Father_goodCov,Mother_goodCov,diag_denovo,male_cases_subjWithVar,male_cases_subjWithHomVar,female_cases_subjWithVar,female_cases_subjWithHomVar,male_controls_subjWithVar,male_controls_subjWithHomVar,female_controls_subjWithVar,female_controls_subjWithHomVar,male_cases_VarCovered,female_cases_VarCovered,male_controls_VarCovered,female_controls_VarCovered,LHZ_vars,LHZ_size,lhz_transfrac,category,DIAG_ACMGcat,lof_pli,lof_oe,mis_z_score,syn_z_score,lof_z_score,qc_repeat_regions,qc_in_coding_exon,zygosity,auto_evidence,isNMDtranscript,trigger50bpRule,MPC,HM_Sampvars_upstream,HM_Sampvars_downstream,HM_Sampvars_sameexon,HM_Pathvars_upstream,HM_Pathvars_downstream,HM_Pathvars_sameexon,is_path,index_subjWithVar,index_subjWithHomVar,father_affstat,mother_affstat,diag_dominant,diag_homrecess,index_subjCompHeterInGeneTrans,index_subjCompHeterInGene,index_subjWithVarInGene,index_subjWithHomVarInGene,male_cases_subjCompHeterInGene,male_cases_subjWithVarInGene,male_cases_subjWithHomVarInGene,male_cases_subjWithGeneCovered,female_cases_subjCompHeterInGene,female_cases_subjWithVarInGene,female_cases_subjWithHomVarInGene,female_cases_subjWithGeneCovered,male_controls_subjCompHeterInGene,male_controls_subjWithVarInGene,male_controls_subjWithHomVarInGene,male_controls_subjWithGeneCovered,female_controls_subjCompHeterInGene,female_controls_subjWithVarInGene,female_controls_subjWithHomVarInGene,female_controls_subjWithGeneCovered,GT_paternity_CHZ,diag_chz_solo,diag_chz,gmb_categorical_bin,gmb_inheritance,primary_transcript,gt_inheritance,vep_max_impact,auto_classification_simple,auto_score,auto_classification,GT,classification,clinvar_class,clinvar_stars,hgmd_id,hgmd_class,hgmd_disease,cnv_paternity,MULTI_prob,MULTI_prob_loeq,MULTI_rank,MIMI_score,MIMI_rank,GDX_phenovar,gmb_is_dom_validated,gmb_is_ar_validated,gmb_is_ar_candidate,gmb_is_xl_validated,gmb_is_any_ar_validated,gmb_is_alldom_validated,acmg_secondary,incidental,GDX_view,cdot,pdot,gnomad_af_info,internal_af_info,gmb_is_xr_validated,pheno_score,norm_pheno_score,gene_highrankx,study_name;
*/
create #Q2# = nor SubjectReports/Participants.rep.link
| inset -c pn <(nor -asdict source/var/wgs_varcalls.gord | select #2)
| inset -c pn <(nor -asdict source/cov/goodcov_8.wgs.gord | select #2)
| inset -c pn <(nor -asdict source/cov/gene_cov_all_seg.gord | select #2)
| inset -c pn <(nor -asdict source/var/cnv_varcalls.gord | select #2)
| select pn,kind,affected,sex,study_name
| replace affected if(affected='yes','affected','unaffected')
| pivot -gc study_name kind -v index,father,mother -e ''
| hide index_affected,father_sex,mother_sex;
nor [#Q2#]
/* | inset -c study_name [grid: 'Query'] nor -r cases | grep trn */
| where index_pn != ''
| rownum
| calc batchref if(rownum > 50," | signature -file [#gregcnv_"+str(rownum-50)+"#]","")
| calc cmd1 "create #greg1_"+rownum+"# = gor ##gregor1_yaml##(index_case = '"+index_pn+"', gender = '"+index_sex+"', mother= '"+mother_pn+"', father = '"+father_pn+"', mother_affstat = '"+mother_affected+"', father_affstat = '"+father_affected+"', gor_or_pgor = 'gor')
| ##skip## | signature -file ##gregor1_yaml##"+batchref+";"
| calc cmd2 "
create #greg2_"+rownum+"# = gor ##gregor2_yaml##(index_case = '"+index_pn+"', mother= '"+mother_pn+"', father = '"+father_pn+"', mother_affstat = '"+mother_affected+"', father_affstat = '"+father_affected+"', prepopulated_result = [#greg1_"+rownum+"#], study_path = 'studies/"+study_name+"'
, output_table = '##gregor2_result##') | ##skip## | signature -file ##gregor2_yaml##"+batchref+";"
| calc cmd3 "create #w1_"+rownum+"# = gor [#greg1_"+rownum+"#] | write s3data://shared/studies/"+study_name+"/##tr##/gregor_step1.gorz;"
| calc cmd4 "create #w2_"+rownum+"# = gor [#greg2_"+rownum+"#] | merge user_data/hakon/gregor2_df.gor | #selcol# | write studies/"+study_name+"/##tr##/gregor_step2.gorz;"
| calc cmd5 "create #gregcnv_"+rownum+"# = gor ##gregor_cnv_yaml##(index_case = '"+index_pn+"', mother= '"+mother_pn+"', father = '"+father_pn+"', mother_affstat = '"+mother_affected+"', father_affstat = '"+father_affected+"', prepopulated_result = [#greg1_"+rownum+"#], study_path = 'studies/"+study_name+"'
, output_table = '##final##') | signature -file ##gregor_cnv_yaml## | skip -1"+batchref+";"
| replace cmd5 replace(cmd5,"[#greg1_"+rownum+"#]","studies/"+study_name+"/##tr##/gregor_step1.gorz")
| calc cmd6 "create #w3_"+rownum+"# = gor [#gregcnv_"+rownum+"#] | ##skip## | write studies/"+study_name+"/##tr##/gregor_cnv.gorz;"
| calc cmd /* cmd1+' '+cmd2+' '+cmd3+' '+cmd4 */ cmd5 + ' '+ cmd6
/*
| calc filepath '../../studies/'+study_name+'/##tr##/gregor_cnv.gorz' | calc alias study_name
| select filepath,study_name
| write user_data/hakon/##tr##_gregor_cnv.gord
*/