Run Gregor studies in batch¶

RunStudies

In [1]:

%%capture
# load the magic extension and imports
%reload_ext nextcode
import pandas as pd
import GOR_query_helper as GQH
%env LOG_QUERY=1

project = "test-hg19"
%env GOR_API_PROJECT={project}

In [2]:

import nextcode
# svc_queryserver = nextcode.get_service("queryserver")
svc_query = nextcode.get_service("query")

In [3]:

alljobs = []
def initialize_jobs():
    global alljobs
    alljobs = []
def add_job(id):
    alljobs.append(id)
def print_job_statuses():
    for id in alljobs:
        job = svc_query.get_query(id)
        print(f"status of {id} is "+job.status)
def cancel_all_jobs():
    for id in alljobs:
        job = svc_query.get_query(id)
        if job.status == 'RUNNING' or job.status == 'PENDING':
            job.cancel()

In [4]:

%%gor manifest <<
nor SubjectReports/Participants.rep.link
| select pn,kind,affected,sex,study_name
| replace affected if(affected='yes','affected','unaffected')
| pivot -gc study_name kind -v index,father,mother -e ''
| hide index_affected,father_sex,mother_sex

Query ran in 0.68 sec
Query fetched 692 rows in 0.07 sec (total time 0.75 sec)

In [50]:

manifest

Out[50]:

	study_name	index_pn	index_sex	father_pn	father_affected	mother_pn	mother_affected
0	GCA825341	TEST_2941719	female	NaN	NaN	NaN	NaN
1	GCA802054	TEST_2909078	male	NaN	NaN	NaN	NaN
2	GCA863723	TEST_2992440	female	NaN	NaN	NaN	NaN
3	GCA820675	TEST_2936504	female	NaN	NaN	NaN	NaN
4	GCA833767	TEST_2953629	female	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...
687	GCA852666	TEST_2977861	male	NaN	NaN	NaN	NaN
688	GCA792495	TEST_2901892	male	TEST_2895471	unaffected	TEST_2901923	unaffected
689	GCA817697	TEST_2925474	female	NaN	NaN	TEST_2919688	unaffected
690	GCA819745	TEST_2931900	male	TEST_2931924	unaffected	TEST_2931915	unaffected
691	GCA855762	TEST_2983016	female	TEST_2983018	unaffected	TEST_2983017	unaffected

692 rows × 7 columns

In [2]:

initialize_jobs()

def NaN2E(x):
    if str(x) == 'nan' or str(x) == 'NaN': return ''
    else: return x

import datetime
print(datetime.datetime.now())

for i in range(0,len(manifest)):errrrrrrrorrrrrhere
    #print(f"{manifest.at[i,'study_name']}")
    gregor_queries = f"""def ##case_id## = {manifest.at[i,'study_name']};
def ##index_case## = '{manifest.at[i,'index_pn']}';
def ##father## = '{NaN2E(manifest.at[i,'father_pn'])}';
def ##father_affstat## = '{NaN2E(manifest.at[i,'father_affected'])}';
def ##mother## = '{NaN2E(manifest.at[i,'mother_pn'])}';
def ##mother_affstat## = '{NaN2E(manifest.at[i,'mother_affected'])}';
def ##index_gender## = '{manifest.at[i,'index_sex']}';
def ##cand_genes## = studies/##case_id##/candidate_genes.gor;
def ##gregor1_yaml## = user_data/hakon/YML/gregor_step1.ftl.yml;
def ##gregor2_yaml## = user_data/hakon/YML/gregor_step2.ftl.yml;
def ##gregor_mimi_yaml## = user_data/hakon/YML/gregor_mimi.ftl.yml;
def ##gregor_gene_yaml## = user_data/hakon/YML/gregor_gene_overview.ftl.yml;
def ##gor_or_pgor## = gor;
create #greg1# = gor ##gregor1_yaml##(index_case = ##index_case##, gender = ##index_gender##, mother= ##mother##, father = ##father##, gor_or_pgor = ##gor_or_pgor##)
| signature -file ##gregor1_yaml##;
create #greg2# = gor ##gregor2_yaml##(index_case = ##index_case##, mother= ##mother##, father = ##father##, prepopulated_result = [#greg1#], study_path = studies/##case_id##
, output_table = '##gregor2_result##')
| columnreorder chrom,pos,reference,call,Gene_symbol,gt_info,zygosity_proband*,zygosity_mother*,zygosity_father*,callratio_*,annotation,gmb_disease_info,gmb_disease_status,sequence_variant,VEP_Max_Consequence,internal_af,gnomad_af,homCount,homCount_info,GDX_classification,ext_classification,cnv_overlap,cnv_overlap_info,gene_hpocodes,gene_highrank,gene_cov_10x;
create #w1# = gor [#greg1#] | write s3data://shared/studies/##case_id##/tr/gregor_step1.gorz;
create #w2# = gor [#greg2#] | write studies/##case_id##/tr/gregor_step2.gorz;
gor [#greg2#] | top 1
"""
    job = svc_query.execute(gregor_queries,job_type="lord",nowait="true")
    id = job.id
    print(f"""study {manifest.at[i,'study_name']}, no {i}, job {id}""")
    print(gregor_queries)
    add_job(id)
    
print(datetime.datetime.now())

  Cell In[2], line 12
    gregor_queries = f"""def ##case_id## = {manifest.at[i,'study_name']};
    ^
IndentationError: unexpected indent

In [60]:

print_job_statuses()

status of 102880 is DONE
status of 102881 is DONE
status of 102882 is DONE
status of 102883 is DONE
status of 102884 is DONE
status of 102885 is DONE
status of 102886 is DONE
status of 102887 is DONE
status of 102888 is DONE
status of 102889 is DONE
status of 102890 is DONE
status of 102891 is CANCELLED
status of 102892 is CANCELLED
status of 102893 is DONE
status of 102894 is CANCELLED
status of 102895 is DONE
status of 102896 is CANCELLED
status of 102897 is DONE
status of 102898 is CANCELLED
status of 102899 is CANCELLED
status of 102900 is DONE
status of 102901 is DONE
status of 102902 is DONE
status of 102903 is DONE
status of 102904 is DONE
status of 102905 is CANCELLED
status of 102906 is DONE
status of 102907 is DONE
status of 102908 is DONE
status of 102909 is DONE
status of 102910 is DONE
status of 102911 is DONE
status of 102912 is CANCELLED
status of 102913 is DONE
status of 102914 is DONE
status of 102915 is DONE
status of 102916 is CANCELLED
status of 102917 is FAILED
status of 102918 is CANCELLED
status of 102919 is DONE
status of 102920 is CANCELLED
status of 102921 is CANCELLED
status of 102922 is CANCELLED
status of 102923 is CANCELLED
status of 102924 is CANCELLED
status of 102925 is CANCELLED
status of 102926 is CANCELLED
status of 102927 is CANCELLED
status of 102928 is CANCELLED
status of 102929 is CANCELLED
status of 102930 is DONE
status of 102931 is CANCELLED
status of 102932 is CANCELLED
status of 102933 is DONE
status of 102934 is CANCELLED
status of 102935 is CANCELLED
status of 102936 is DONE
status of 102937 is CANCELLED
status of 102938 is CANCELLED
status of 102939 is DONE
status of 102940 is CANCELLED
status of 102941 is DONE
status of 102942 is DONE
status of 102943 is CANCELLED
status of 102944 is DONE
status of 102945 is DONE
status of 102946 is CANCELLED
status of 102947 is CANCELLED
status of 102948 is CANCELLED
status of 102949 is DONE
status of 102950 is CANCELLED
status of 102951 is DONE
status of 102952 is DONE
status of 102953 is DONE
status of 102954 is DONE
status of 102955 is CANCELLED
status of 102956 is CANCELLED
status of 102957 is CANCELLED
status of 102958 is CANCELLED
status of 102959 is DONE
status of 102960 is DONE
status of 102961 is CANCELLED
status of 102962 is DONE
status of 102963 is CANCELLED
status of 102964 is DONE
status of 102965 is CANCELLED
status of 102966 is DONE
status of 102967 is DONE
status of 102968 is CANCELLED
status of 102969 is CANCELLED
status of 102970 is CANCELLED
status of 102971 is CANCELLED
status of 102972 is CANCELLED
status of 102973 is DONE
status of 102974 is DONE
status of 102975 is CANCELLED
status of 102976 is CANCELLED
status of 102977 is DONE
status of 102978 is CANCELLED
status of 102979 is CANCELLED
status of 102980 is CANCELLED
status of 102981 is CANCELLED
status of 102982 is CANCELLED
status of 102983 is CANCELLED
status of 102984 is DONE
status of 102985 is CANCELLED
status of 102986 is CANCELLED
status of 102987 is CANCELLED
status of 102988 is CANCELLED
status of 102989 is CANCELLED
status of 102990 is DONE
status of 102991 is CANCELLED
status of 102992 is CANCELLED
status of 102993 is CANCELLED
status of 102994 is CANCELLED
status of 102995 is CANCELLED
status of 102996 is CANCELLED
status of 102997 is DONE
status of 102998 is CANCELLED
status of 102999 is CANCELLED
status of 103000 is CANCELLED
status of 103001 is DONE
status of 103002 is CANCELLED
status of 103003 is CANCELLED
status of 103004 is CANCELLED
status of 103005 is CANCELLED
status of 103006 is DONE
status of 103007 is CANCELLED
status of 103008 is CANCELLED
status of 103009 is DONE
status of 103010 is CANCELLED
status of 103011 is DONE
status of 103012 is CANCELLED
status of 103013 is CANCELLED
status of 103014 is CANCELLED
status of 103015 is CANCELLED
status of 103016 is DONE
status of 103017 is DONE
status of 103018 is DONE
status of 103019 is CANCELLED
status of 103020 is DONE
status of 103021 is CANCELLED
status of 103022 is CANCELLED
status of 103023 is CANCELLED
status of 103024 is CANCELLED
status of 103025 is CANCELLED
status of 103026 is DONE
status of 103027 is CANCELLED
status of 103028 is CANCELLED
status of 103029 is CANCELLED
status of 103030 is CANCELLED
status of 103031 is CANCELLED
status of 103032 is CANCELLED
status of 103033 is CANCELLED
status of 103034 is DONE
status of 103035 is DONE
status of 103036 is CANCELLED
status of 103037 is DONE
status of 103038 is CANCELLED
status of 103039 is CANCELLED
status of 103040 is CANCELLED
status of 103041 is DONE
status of 103042 is CANCELLED
status of 103043 is CANCELLED
status of 103044 is CANCELLED
status of 103045 is CANCELLED
status of 103046 is DONE
status of 103047 is CANCELLED
status of 103048 is CANCELLED
status of 103049 is CANCELLED
status of 103050 is DONE
status of 103051 is CANCELLED
status of 103052 is DONE
status of 103053 is CANCELLED
status of 103054 is DONE
status of 103055 is CANCELLED
status of 103056 is CANCELLED
status of 103057 is DONE
status of 103058 is DONE
status of 103059 is CANCELLED
status of 103060 is CANCELLED
status of 103061 is DONE
status of 103062 is CANCELLED
status of 103063 is DONE
status of 103064 is CANCELLED
status of 103065 is DONE
status of 103066 is CANCELLED
status of 103067 is CANCELLED
status of 103068 is DONE
status of 103069 is CANCELLED
status of 103070 is CANCELLED
status of 103071 is DONE
status of 103072 is CANCELLED
status of 103073 is CANCELLED
status of 103074 is CANCELLED
status of 103075 is CANCELLED
status of 103076 is CANCELLED
status of 103077 is CANCELLED
status of 103078 is CANCELLED
status of 103079 is CANCELLED
status of 103080 is CANCELLED
status of 103081 is CANCELLED
status of 103082 is CANCELLED
status of 103083 is DONE
status of 103084 is CANCELLED
status of 103085 is DONE
status of 103086 is DONE

In [59]:

cancel_all_jobs()

In [ ]:

def ##gregor1_yaml## = user_data/hakon/YML/gregor_step1.ftl.yml;
def ##gregor2_yaml## = user_data/hakon/YML/gregor_step2.ftl.yml;
def ##gregor_cnv_yaml## = user_data/hakon/YML/gregor_cnvs_anno.ftl.yml;
def ##skip## = skip -2;
def ##tr## = tr3;

/* def #selcol# = select 1,2,Reference,Call,gene_symbol,GT_Info,zygosity_proband,zygosity_mother,zygosity_father,callratio_proband,callratio_father,callratio_mother,annotation,gmb_disease_info,gmb_disease_status,sequence_variant,VEP_Max_Consequence,internal_af,gnomad_af,homCount,homCount_info,GDX_classification,ext_classification,cnv_overlap,cnv_overlap_info,gene_hpocodes,gene_highrank,gene_cov_10x,PN,HGNC_id,vep_gene_symbol,Feature,ref_af,internal_ac,internal_homCount,xa_aff_af,xa_aff_ac,xa_aff_homCount,gnomad_homcount,gnomad_v4_1_AF,gnomad_v4_1_homcount,Consequence,VEP_Impact,Amino_acids,Protein_position,Protein_Size,Biotype,EXON,Refgene,qc_consequence_rank,CallRatio,CallCopies,Depth,FILTER,GL_Call,qc_in_exon,qc_protected_vars,qc_high_mod_impact,qc_in_roi,qc_known_vars,qc_closeby_known_vars,fidel_bp4_pp3,spliceai_max_consequence,spliceai_max_impact,Fidel_REVEL,Fidel_PRECPAT,Fidel_PRECBEN,Fidel_CALIBRATED,Fidel_ALPHA,Provean,GT_IHE,GT_Paternity,ratio_breakdown,father_call,father_CallCopies,father_CallRatio,father_Depth,father_ratio_breakdown,father_ApprDepth,father_GT,father_GTx,mother_call,mother_CallCopies,mother_CallRatio,mother_Depth,mother_ratio_breakdown,mother_ApprDepth,mother_GT,mother_GTx,Gene_cov,gene_cov_15x,Gene_avg_depth,Father_goodCov,Mother_goodCov,diag_denovo,male_cases_subjWithVar,male_cases_subjWithHomVar,female_cases_subjWithVar,female_cases_subjWithHomVar,male_controls_subjWithVar,male_controls_subjWithHomVar,female_controls_subjWithVar,female_controls_subjWithHomVar,male_cases_VarCovered,female_cases_VarCovered,male_controls_VarCovered,female_controls_VarCovered,LHZ_vars,LHZ_size,lhz_transfrac,category,DIAG_ACMGcat,lof_pli,lof_oe,mis_z_score,syn_z_score,lof_z_score,qc_repeat_regions,qc_in_coding_exon,zygosity,auto_evidence,isNMDtranscript,trigger50bpRule,MPC,HM_Sampvars_upstream,HM_Sampvars_downstream,HM_Sampvars_sameexon,HM_Pathvars_upstream,HM_Pathvars_downstream,HM_Pathvars_sameexon,is_path,index_subjWithVar,index_subjWithHomVar,father_affstat,mother_affstat,diag_dominant,diag_homrecess,index_subjCompHeterInGeneTrans,index_subjCompHeterInGene,index_subjWithVarInGene,index_subjWithHomVarInGene,male_cases_subjCompHeterInGene,male_cases_subjWithVarInGene,male_cases_subjWithHomVarInGene,male_cases_subjWithGeneCovered,female_cases_subjCompHeterInGene,female_cases_subjWithVarInGene,female_cases_subjWithHomVarInGene,female_cases_subjWithGeneCovered,male_controls_subjCompHeterInGene,male_controls_subjWithVarInGene,male_controls_subjWithHomVarInGene,male_controls_subjWithGeneCovered,female_controls_subjCompHeterInGene,female_controls_subjWithVarInGene,female_controls_subjWithHomVarInGene,female_controls_subjWithGeneCovered,GT_paternity_CHZ,diag_chz_solo,diag_chz,gmb_categorical_bin,gmb_inheritance,primary_transcript,gt_inheritance,vep_max_impact,auto_classification_simple,auto_score,auto_classification,GT,classification,clinvar_class,clinvar_stars,hgmd_id,hgmd_class,hgmd_disease,cnv_paternity,MULTI_prob,MULTI_prob_loeq,MULTI_rank,MIMI_score,MIMI_rank,GDX_phenovar,gmb_is_dom_validated,gmb_is_ar_validated,gmb_is_ar_candidate,gmb_is_xl_validated,gmb_is_any_ar_validated,gmb_is_alldom_validated,acmg_secondary,incidental,GDX_view,cdot,pdot,gnomad_af_info,internal_af_info,gmb_is_xr_validated,pheno_score,norm_pheno_score,gene_highrankx,study_name;
*/

create #Q2# = nor SubjectReports/Participants.rep.link
| inset -c pn <(nor -asdict source/var/wgs_varcalls.gord | select #2)
| inset -c pn <(nor -asdict source/cov/goodcov_8.wgs.gord | select #2)
| inset -c pn <(nor -asdict source/cov/gene_cov_all_seg.gord | select #2)
| inset -c pn <(nor -asdict source/var/cnv_varcalls.gord | select #2)

| select pn,kind,affected,sex,study_name
| replace affected if(affected='yes','affected','unaffected')
| pivot -gc study_name kind -v index,father,mother -e ''
| hide index_affected,father_sex,mother_sex;
                
nor [#Q2#] 
/* | inset -c study_name [grid: 'Query'] nor -r cases | grep trn */
| where index_pn != ''
| rownum
| calc batchref if(rownum > 50," | signature -file [#gregcnv_"+str(rownum-50)+"#]","")
| calc cmd1 "create #greg1_"+rownum+"# = gor ##gregor1_yaml##(index_case = '"+index_pn+"', gender = '"+index_sex+"', mother= '"+mother_pn+"', father = '"+father_pn+"', mother_affstat = '"+mother_affected+"', father_affstat = '"+father_affected+"', gor_or_pgor = 'gor')
| ##skip## | signature -file ##gregor1_yaml##"+batchref+";"
| calc cmd2 "
create #greg2_"+rownum+"# = gor ##gregor2_yaml##(index_case = '"+index_pn+"', mother= '"+mother_pn+"', father = '"+father_pn+"', mother_affstat = '"+mother_affected+"', father_affstat = '"+father_affected+"', prepopulated_result = [#greg1_"+rownum+"#], study_path = 'studies/"+study_name+"'
, output_table = '##gregor2_result##') | ##skip## | signature -file ##gregor2_yaml##"+batchref+";"
| calc cmd3 "create #w1_"+rownum+"# = gor [#greg1_"+rownum+"#] | write s3data://shared/studies/"+study_name+"/##tr##/gregor_step1.gorz;"
| calc cmd4 "create #w2_"+rownum+"# = gor [#greg2_"+rownum+"#] | merge user_data/hakon/gregor2_df.gor | #selcol# | write studies/"+study_name+"/##tr##/gregor_step2.gorz;"
| calc cmd5 "create #gregcnv_"+rownum+"# = gor ##gregor_cnv_yaml##(index_case = '"+index_pn+"', mother= '"+mother_pn+"', father = '"+father_pn+"', mother_affstat = '"+mother_affected+"', father_affstat = '"+father_affected+"', prepopulated_result = [#greg1_"+rownum+"#], study_path = 'studies/"+study_name+"'
, output_table = '##final##') | signature -file ##gregor_cnv_yaml## | skip -1"+batchref+";"
 | replace cmd5 replace(cmd5,"[#greg1_"+rownum+"#]","studies/"+study_name+"/##tr##/gregor_step1.gorz") 
 | calc cmd6 "create #w3_"+rownum+"# = gor [#gregcnv_"+rownum+"#] | ##skip## | write studies/"+study_name+"/##tr##/gregor_cnv.gorz;"
 | calc cmd /* cmd1+' '+cmd2+' '+cmd3+' '+cmd4  */ cmd5 + ' '+ cmd6
/*
 | calc filepath '../../studies/'+study_name+'/##tr##/gregor_cnv.gorz' | calc alias study_name
| select filepath,study_name 
| write user_data/hakon/##tr##_gregor_cnv.gord
*/