Loading...
Loading...
Loading...
Loading...
Loading...
Loading...
Loading...
Loading...
Loading...
Loading...
In this tutorial, you will learn how to create and run a notebook in JupyterLab on the platform, download data from the notebook, and upload results to the platform.
%%bash
dx download input_data/reads.fastq

%%bash
dx upload results.csv

Using MONAI Core, MONAI Label/3D Slicer (SlicerJupyter) via JupyterLab
Use Jupyter notebooks on the DNAnexus Platform to craft sophisticated custom analyses in your preferred coding language.

dx run app-dxjupyterlab
dx run app-dxjupyterlab_spark_cluster
dx run -h APP_NAME


my_cmd="papermill notebook.ipynb output_notebook.ipynb"
dx run dxjupyterlab -icmd="$my_cmd" -iin="notebook.ipynb"

Learn to use the JupyterLab Spark Cluster app.
Using Stata via JupyterLab, working with project files, and creating datasets with Spark.
This page is a reference for most useful operations and features in the JupyterLab environment.



import pyspark
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

spark.sql("show databases").show(truncate=False)

+------------------------------------------------------------+
|namespace |
+------------------------------------------------------------+
|database_xxxx__brca_pheno |
|database_yyyy__gwas_vitamind_chr1 |
|database_zzzz__meta_data |
|database_tttt__genomics_180820 |
+------------------------------------------------------------+

db = "database_xxxx__brca_pheno"
spark.sql(f"SHOW TABLES FROM {db}").show(truncate=False)

+------------------------------------+-----------+-----------+
|namespace |tableName |isTemporary|
+------------------------------------+-----------+-----------+
|database_xxxx__brca_pheno |cna |false |
|database_xxxx__brca_pheno |methylation|false |
|database_xxxx__brca_pheno |mrna |false |
|database_xxxx__brca_pheno |mutations |false |
|database_xxxx__brca_pheno |patient |false |
|database_xxxx__brca_pheno |sample |false |
+------------------------------------+-----------+-----------+

show databases like "<project_id_pattern>:<database_name_pattern>";
show databases like "project-*:<database_name>";

+------------------------------------------------------------+
|namespace |
+------------------------------------------------------------+
|database_xxxx__brca_pheno |
|database_yyyy__gwas_vitamind_chr1 |
|database_zzzz__meta_data |
|database_tttt__genomics_180820 |
+------------------------------------------------------------+

db = "database_xxxx__brca_pheno"
spark.sql(f"SHOW TABLES FROM {db}").show(truncate=False)

# Create a database
my_database = "my_database"
spark.sql("create database " + my_database + " location 'dnax://'")
spark.sql("create table " + my_database + ".foo (k string, v string) using parquet")
spark.sql("insert into table " + my_database + ".foo values ('1', '2')")
spark.sql("select * from " + my_database + ".foo")

import hail as hl
hl.init(sc=sc)

# Download example data from 1k genomes project and inspect the matrix table
hl.utils.get_1kg('data/')
hl.import_vcf('data/1kg.vcf.bgz').write('data/1kg.mt', overwrite=True)
mt = hl.read_matrix_table('data/1kg.mt')
mt.rows().select().show(5)

{
"command": [
"docker",
"run",
"-i",
"-v",
"/cluster/vep:/root/.vep",
"dnanexus/dxjupyterlab-vep",
"./vep",
"--format",
"vcf",
"__OUTPUT_FORMAT_FLAG__",
"--everything",
"--allele_number",
"--no_stats",
"--cache",
"--offline",
"--minimal",
"--assembly",
"GRCh38",
"-o",
"STDOUT",
"--check_existing",
"--dir_cache",
"/root/.vep/",
"--dir_plugins",
"/root/.vep/Plugins/loftee",
"--fasta",
"/root/.vep/homo_sapiens/109_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa.gz",
"--plugin",
"LoF,loftee_path:/root/.vep/Plugins/loftee,human_ancestor_fa:/root/.vep/human_ancestor.fa,conservation_file:/root/.vep/loftee.sql,gerp_bigwig:/root/.vep/gerp_conservation_scores.homo_sapiens.GRCh38.bw"
],
"env": {
"PERL5LIB": "/root/.vep/Plugins"
},
"vep_json_schema": "Struct{assembly_name:String,allele_string:String,ancestral:String,colocated_variants:Array[Struct{aa_allele:String,aa_maf:Float64,afr_allele:String,afr_maf:Float64,allele_string:String,amr_allele: String,amr_maf:Float64,clin_sig:Array[String],end:Int32,eas_allele:String,eas_maf:Float64,ea_allele:String,ea_maf:Float64,eur_allele:String,eur_maf:Float64,exac_adj_allele:String,exac_adj_maf:Float64,exac_allele: String,exac_afr_allele:String,exac_afr_maf:Float64,exac_amr_allele:String,exac_amr_maf:Float64,exac_eas_allele:String,exac_eas_maf:Float64,exac_fin_allele:String,exac_fin_maf:Float64,exac_maf:Float64,exac_nfe_allele: String,exac_nfe_maf:Float64,exac_oth_allele:String,exac_oth_maf:Float64,exac_sas_allele:String,exac_sas_maf:Float64,id:String,minor_allele:String,minor_allele_freq:Float64,phenotype_or_disease:Int32,pubmed: Array[Int32],sas_allele:String,sas_maf:Float64,somatic:Int32,start:Int32,strand:Int32}],context:String,end:Int32,id:String,input:String,intergenic_consequences:Array[Struct{allele_num:Int32,consequence_terms: Array[String],impact:String,minimised:Int32,variant_allele:String}],most_severe_consequence:String,motif_feature_consequences:Array[Struct{allele_num:Int32,consequence_terms:Array[String],high_inf_pos:String,impact: String,minimised:Int32,motif_feature_id:String,motif_name:String,motif_pos:Int32,motif_score_change:Float64,strand:Int32,variant_allele:String}],regulatory_feature_consequences:Array[Struct{allele_num:Int32,biotype: String,consequence_terms:Array[String],impact:String,minimised:Int32,regulatory_feature_id:String,variant_allele:String}],seq_region_name:String,start:Int32,strand:Int32,transcript_consequences: Array[Struct{allele_num:Int32,amino_acids:String,appris:String,biotype:String,canonical:Int32,ccds:String,cdna_start:Int32,cdna_end:Int32,cds_end:Int32,cds_start:Int32,codons:String,consequence_terms:Array[String], 
distance:Int32,domains:Array[Struct{db:String,name:String}],exon:String,gene_id:String,gene_pheno:Int32,gene_symbol:String,gene_symbol_source:String,hgnc_id:String,hgvsc:String,hgvsp:String,hgvs_offset:Int32,impact: String,intron:String,lof:String,lof_flags:String,lof_filter:String,lof_info:String,minimised:Int32,polyphen_prediction:String,polyphen_score:Float64,protein_end:Int32,protein_start:Int32,protein_id:String, sift_prediction:String,sift_score:Float64,strand:Int32,swissprot:String,transcript_id:String,trembl:String,tsl:Int32,uniparc:String,variant_allele:String}],variant_class:String}"
}

# Annotation process relies on "dnanexus/dxjupyterlab-vep" docker container
# as well as VEP and LoF resources that are pre-installed on every Spark node when
# HAIL-VEP feature is selected.
annotated_mt = hl.vep(mt, "file:///mnt/project/vep-GRCh38.json")

source /home/dnanexus/environment
source /cluster/dx-cluster.environment

# read csv from public bucket
df = spark.read.options(delimiter='\t', header='True', inferSchema='True').csv("s3://1000genomes/20131219.populations.tsv")
df.select(df.columns[:4]).show(10, False)

# access private data in S3 by first unsetting the default credentials provider
sc._jsc.hadoopConfiguration().set('fs.s3a.aws.credentials.provider', '')
# replace "redacted" with your keys
sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', 'redacted')
sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', 'redacted')
df=spark.read.csv("s3a://your_private_bucket/your_path_to_csv")
df.select(df.columns[:5]).show(10, False)

{
"license": {
"serialNumber": "<Serial number from Stata>",
"code": "<Code from Stata>",
"authorization": "<Authorization from Stata>",
"user": "<Registered user line 1>",
"organization": "<Registered user line 2>"
}
}

{
"licenseFile": {
"$dnanexus_link": {
"id": "file-xxxx",
"project": "project-yyyy"
}
}
}

!dx download project-xxxx:file-yyyy

use /mnt/project/<path>/data_in.dta

import delimited /mnt/project/<path>/data_in.csv

save data_out

export delimited data_out.csv

!dx upload <file> --destination=<destination>

dx upload <file> --destination=<destination>

pandas_df = spark_df.toPandas()
pandas_df.to_stata("data_out.dta")
pandas_df.to_csv("data_out.csv")

%%bash
dx upload <file>

%%bash
dx download input_data/reads.fastq

! dx download input_data/reads.fastq

import dxpy
dxpy.download_dxfile(dxid='file-xxxx',
                     filename='unique_name.txt')

%%bash
dx upload Readme.ipynb

import dxpy
dxpy.upload_local_file('variants.vcf')

$ dx pwd
MyProject:/

%%bash
pip install torch
pip install torchvision
conda install -c conda-forge opencv

my_cmd="papermill notebook.ipynb output_notebook.ipynb"
dx run dxjupyterlab -icmd="$my_cmd" -iin="notebook.ipynb"

# nvidia-smi
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06 Driver Version: 470.129.06 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
// Let's upgrade CUDA 11.4 to 12.5
# apt-get update
# apt-get -y install cuda-toolkit-12-5 cuda-compat-12-5
# echo /usr/local/cuda/compat > /etc/ld.so.conf.d/NVIDIA-compat.conf
# ldconfig
# nvidia-smi
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.256.02 Driver Version: 470.256.02 CUDA Version: 12.5 |
|-------------------------------+----------------------+----------------------+
// CUDA 12.5 is now usable from terminal and notebooks

The dx commands, extract_dataset and extract_assay germline, let you either retrieve the data dictionary of a dataset or extract the underlying data described by that dictionary. You can also use these commands to get dataset metadata, such as the names and titles of entities and fields, or to list all relevant assays in a dataset.

import pyspark
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

install.packages("sparklyr")
library(sparklyr)
port <- Sys.getenv("SPARK_MASTER_PORT")
master <- paste("spark://master:", port, sep = '')
sc = spark_connect(master)

retrieve_sql = 'select .... from .... '
df = spark.sql(retrieve_sql)

library(DBI)
retrieve_sql <- 'select .... from .... '
df = dbGetQuery(sc, retrieve_sql)

import subprocess
cmd = ["dx", "extract_dataset", dataset, "--fields", "entity1.field1, entity1.field2, entity2.field4", "--sql", "-o", "extracted_data.sql"]
subprocess.check_call(cmd)

cmd <- paste("dx extract_dataset", dataset, "--fields", "entity1.field1, entity1.field2, entity2.field4", "--sql", "-o extracted_data.sql")
system(cmd)

import subprocess
cmd = ["dx", "extract_assay", "germline", dataset, "--retrieve-allele", "allele_filter.json", "--sql", "-o", "extract_allele.sql"]
subprocess.check_call(cmd)

cmd <- paste("dx extract_assay", "germline", dataset, "--retrieve-allele", "allele_filter.json", "--sql", "-o extracted_allele.sql")
system(cmd)

with open("extracted_data.sql", "r") as file:
    retrieve_sql = ""
    for line in file:
        retrieve_sql += line.strip()
df = spark.sql(retrieve_sql.strip(";"))

install.packages("tidyverse")
library(readr)
retrieve_sql <-read_file("extracted_data.sql")
retrieve_sql <- gsub("[;\n]", "", retrieve_sql)
df <- dbGetQuery(sc, retrieve_sql)