Support home > Cell Ranger > Downloads
Build Notes for Reference Packages

Build Notes for Reference Packages

For information on updates to the publicly available 10x Genomics transcriptome and V(D)J references, please refer to the reference release notes.

10x Genomics offers pre-built Cell Ranger reference packages from the downloads page. For purposes of reproducibility, the exact build steps are provided here.

#!/usr/bin/env bash
# Build the GRCh38 2024-A Cell Ranger reference package from the Ensembl
# release-109 primary-assembly FASTA and the GENCODE v44 annotation GTF.
#
# Strict mode: stop at the first failed command, unset variable, or failed
# pipeline stage, so that a broken download or an empty filter result can
# never flow silently into `cellranger mkref`.
set -euo pipefail

# Genome metadata
genome="GRCh38"
version="2024-A"

# Set up source and build directories
build="GRCh38-GENCODEv44_build"
mkdir -p "$build"

# Download source files if they do not exist in reference_sources/ folder
source="reference_sources"
mkdir -p "$source"

# Using release 109 FASTA for GRCh38 instead of release 110 FASTA -- release 110
# moved from GRCh38.p13 to GRCh38.p14, which unmasked the pseudo-autosomal
# region. This causes ambiguous mappings to PAR locus genes.
# No other sequence changes were made to the primary assembly.
fasta_url="http://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
fasta_in="${source}/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
gtf_url="http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.primary_assembly.annotation.gtf.gz"
gtf_in="${source}/gencode.v44.primary_assembly.annotation.gtf"

# Download the gzip-compressed file at URL $1 and store it decompressed at
# path $2, unless $2 already exists.
fetch_gunzip() {
  local url=$1
  local dest=$2
  if [[ -f "$dest" ]]; then
    return 0
  fi
  # Stream into a temporary name and rename only on success; otherwise an
  # interrupted download would leave a truncated file that the existence
  # check above accepts on the next run. -f makes curl fail on HTTP errors.
  curl -fsSL "$url" | zcat > "${dest}.partial"
  mv -- "${dest}.partial" "$dest"
}

fetch_gunzip "$fasta_url" "$fasta_in"
fetch_gunzip "$gtf_url" "$gtf_in"

# Modify sequence headers in the Ensembl FASTA to match the file
# "GRCh38.primary_assembly.genome.fa" from GENCODE. Unplaced and unlocalized
# sequences such as "KI270728.1" have the same names in both versions.
#
# Input FASTA:
#   >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
#
# Output FASTA:
#   >chr1 1
fasta_modified="$build/$(basename "$fasta_in").modified"
# sed commands:
# 1. Replace metadata after space with original contig name, as in GENCODE
# 2. Add "chr" to names of autosomes and sex chromosomes
# 3. Handle the mitochondrial chromosome
sed -E \
  -e 's/^>(\S+).*/>\1 \1/' \
  -e 's/^>([0-9]+|[XY]) />chr\1 /' \
  -e 's/^>MT />chrM /' \
  "$fasta_in" > "$fasta_modified"

# Remove version suffix from transcript, gene, and exon IDs in order to match
# previous Cell Ranger reference packages
#
# Input GTF:
#     ... gene_id "ENSG00000223972.5"; ...
# Output GTF:
#     ... gene_id "ENSG00000223972"; gene_version "5"; ...
gtf_modified="$build/$(basename "$gtf_in").modified"
# Pattern matches Ensembl gene, transcript, and exon IDs for human or mouse:
ID="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
sed -E \
  -e 's/gene_id "'"$ID"'";/gene_id "\1"; gene_version "\3";/' \
  -e 's/transcript_id "'"$ID"'";/transcript_id "\1"; transcript_version "\3";/' \
  -e 's/exon_id "'"$ID"'";/exon_id "\1"; exon_version "\3";/' \
  "$gtf_in" > "$gtf_modified"

# Define string patterns for GTF tags
# NOTES:
# Since Ensembl 110, polymorphic pseudogenes are now just protein_coding.
# Readthrough genes are annotated with the readthrough_transcript tag.
BIOTYPE_PATTERN=\
"(protein_coding|protein_coding_LoF|lncRNA|\
IG_C_gene|IG_D_gene|IG_J_gene|IG_LV_gene|IG_V_gene|\
IG_V_pseudogene|IG_J_pseudogene|IG_C_pseudogene|\
TR_C_gene|TR_D_gene|TR_J_gene|TR_V_gene|\
TR_V_pseudogene|TR_J_pseudogene)"
GENE_PATTERN="gene_type \"${BIOTYPE_PATTERN}\""
TX_PATTERN="transcript_type \"${BIOTYPE_PATTERN}\""
READTHROUGH_PATTERN="tag \"readthrough_transcript\""

# Construct the gene ID allowlist. We filter the list of all transcripts
# based on these criteria:
#   - allowable gene_type (biotype)
#   - allowable transcript_type (biotype)
#   - no "readthrough_transcript" tag
# We then collect the list of gene IDs that have at least one associated
# transcript passing the filters.
# With pipefail, a filter stage that matches nothing aborts the build here
# instead of producing an empty allowlist.
awk '$3 == "transcript"' "$gtf_modified" \
  | grep -E "$GENE_PATTERN" \
  | grep -E "$TX_PATTERN" \
  | grep -Ev "$READTHROUGH_PATTERN" \
  | sed -E 's/.*(gene_id "[^"]+").*/\1/' \
  | LC_ALL=C sort -u \
  > "${build}/gene_allowlist"

# NOTES:
# Since Ensembl 110, the PAR locus genes are included on chrY as copies of chrX
# Using the GRCh38.p13 assembly hard masks these regions on chrY, but removing the
# chrY PAR genes is still desirable so they do not end up as extra entries in the output.
# The awk command below excludes all PAR_Y genes, including XGY2.
# The non-coding gene XGY2 straddles the PAR1 boundary on chrY, and is homologous to XG on chrX.
# GRCh38-2024-A excludes XGY2, but includes SRY and ENSG00000286130, which are in an intron of XGY2,
# and RPS4Y1, which overlaps XGY2.

# Filter the GTF file based on the gene allowlist
gtf_filtered="${build}/$(basename "$gtf_in").filtered"
# Copy header lines beginning with "#". awk is used rather than grep so that
# a GTF without header lines does not trip `set -e`.
awk '/^#/' "$gtf_modified" > "$gtf_filtered"
# Filter to the gene allowlist, and then remove PAR_Y genes
grep -Ff "${build}/gene_allowlist" "$gtf_modified" \
  | awk -F "\t" '$1 != "chrY" || $1 == "chrY" && $4 >= 2752083 && $4 < 56887903 && !/ENSG00000290840/' \
  >> "$gtf_filtered"

# Create reference package
cellranger mkref --ref-version="$version" \
  --genome="$genome" --fasta="$fasta_modified" --genes="$gtf_filtered" \
  --nthreads=16
#!/usr/bin/env bash
# Build the GRCm39 2024-A Cell Ranger reference package from the Ensembl
# release-110 primary-assembly FASTA and the GENCODE vM33 annotation GTF.
#
# Strict mode: stop at the first failed command, unset variable, or failed
# pipeline stage, so that a broken download or an empty filter result can
# never flow silently into `cellranger mkref`.
set -euo pipefail

# Genome metadata
genome="GRCm39"
version="2024-A"

# Set up source and build directories
build="GRCm39-GENCODEv33_build"
mkdir -p "$build"

# Download source files if they do not exist in reference_sources/ folder
source="reference_sources"
mkdir -p "$source"

fasta_url="http://ftp.ensembl.org/pub/release-110/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.primary_assembly.fa.gz"
fasta_in="${source}/Mus_musculus.GRCm39.dna.primary_assembly.fa"
gtf_url="http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M33/gencode.vM33.primary_assembly.annotation.gtf.gz"
gtf_in="${source}/gencode.vM33.primary_assembly.annotation.gtf"

# Download the gzip-compressed file at URL $1 and store it decompressed at
# path $2, unless $2 already exists.
fetch_gunzip() {
  local url=$1
  local dest=$2
  if [[ -f "$dest" ]]; then
    return 0
  fi
  # Stream into a temporary name and rename only on success; otherwise an
  # interrupted download would leave a truncated file that the existence
  # check above accepts on the next run. -f makes curl fail on HTTP errors.
  curl -fsSL "$url" | zcat > "${dest}.partial"
  mv -- "${dest}.partial" "$dest"
}

fetch_gunzip "$fasta_url" "$fasta_in"
fetch_gunzip "$gtf_url" "$gtf_in"

# Modify sequence headers in the Ensembl FASTA to match the GENCODE
# primary-assembly genome FASTA for GRCm39. Unplaced and unlocalized
# sequences such as "GL456210.1" have the same names in both versions.
#
# Input FASTA:
#   >1 dna:chromosome chromosome:GRCm39:1:1:<length>:1 REF
#
# Output FASTA:
#   >chr1 1
fasta_modified="$build/$(basename "$fasta_in").modified"
# sed commands:
# 1. Replace metadata after space with original contig name, as in GENCODE
# 2. Add "chr" to names of autosomes and sex chromosomes
# 3. Handle the mitochondrial chromosome
sed -E \
  -e 's/^>(\S+).*/>\1 \1/' \
  -e 's/^>([0-9]+|[XY]) />chr\1 /' \
  -e 's/^>MT />chrM /' \
  "$fasta_in" > "$fasta_modified"

# Remove version suffix from transcript, gene, and exon IDs in order to match
# previous Cell Ranger reference packages
#
# Input GTF:
#     ... gene_id "ENSMUSG00000102693.1"; ...
# Output GTF:
#     ... gene_id "ENSMUSG00000102693"; gene_version "1"; ...
gtf_modified="$build/$(basename "$gtf_in").modified"
# Pattern matches Ensembl gene, transcript, and exon IDs for human or mouse:
ID="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
sed -E \
  -e 's/gene_id "'"$ID"'";/gene_id "\1"; gene_version "\3";/' \
  -e 's/transcript_id "'"$ID"'";/transcript_id "\1"; transcript_version "\3";/' \
  -e 's/exon_id "'"$ID"'";/exon_id "\1"; exon_version "\3";/' \
  "$gtf_in" > "$gtf_modified"

# Define string patterns for GTF tags
# Since Ensembl 110, polymorphic pseudogenes are now just protein_coding.
# Readthrough genes are annotated with the readthrough_transcript tag.
BIOTYPE_PATTERN=\
"(protein_coding|protein_coding_LoF|lncRNA|\
IG_C_gene|IG_D_gene|IG_J_gene|IG_LV_gene|IG_V_gene|\
IG_V_pseudogene|IG_J_pseudogene|IG_C_pseudogene|\
TR_C_gene|TR_D_gene|TR_J_gene|TR_V_gene|\
TR_V_pseudogene|TR_J_pseudogene)"
GENE_PATTERN="gene_type \"${BIOTYPE_PATTERN}\""
TX_PATTERN="transcript_type \"${BIOTYPE_PATTERN}\""
READTHROUGH_PATTERN="tag \"readthrough_transcript\""

# Construct the gene ID allowlist. We filter the list of all transcripts
# based on these criteria:
#   - allowable gene_type (biotype)
#   - allowable transcript_type (biotype)
#   - no "readthrough_transcript" tag
# We then collect the list of gene IDs that have at least one associated
# transcript passing the filters.
# With pipefail, a filter stage that matches nothing aborts the build here
# instead of producing an empty allowlist.
awk '$3 == "transcript"' "$gtf_modified" \
  | grep -E "$GENE_PATTERN" \
  | grep -E "$TX_PATTERN" \
  | grep -Ev "$READTHROUGH_PATTERN" \
  | sed -E 's/.*(gene_id "[^"]+").*/\1/' \
  | LC_ALL=C sort -u \
  > "${build}/gene_allowlist"

# Filter the GTF file based on the gene allowlist
gtf_filtered="${build}/$(basename "$gtf_in").filtered"
# Copy header lines beginning with "#". awk is used rather than grep so that
# a GTF without header lines does not trip `set -e`.
awk '/^#/' "$gtf_modified" > "$gtf_filtered"
# Filter to the gene allowlist
grep -Ff "${build}/gene_allowlist" "$gtf_modified" \
  >> "$gtf_filtered"

# Create reference package
cellranger mkref --ref-version="$version" \
  --genome="$genome" --fasta="$fasta_modified" --genes="$gtf_filtered" \
  --nthreads=16
#!/usr/bin/env bash
# Build the combined human (GRCh38) and mouse (GRCm39) 2024-A Cell Ranger
# reference package. The human and mouse inputs go through the same FASTA
# and GTF processing steps; only the human GTF gets the extra chrY filter.
#
# Strict mode: stop at the first failed command, unset variable, or failed
# pipeline stage, so that a broken download or an empty filter result can
# never flow silently into `cellranger mkref`.
set -euo pipefail

#################### SETUP ####################

human_genome="GRCh38"
mouse_genome="GRCm39"
version="2024-A"

build="GRCh38_and_GRCm39_GENCODEv44-GENCODEvm33"
mkdir -p "$build"

# Download source files if they do not exist in reference_sources/ folder
source="reference_sources"
mkdir -p "$source"

# Using release 109 for GRCh38 instead of release 110 -- release 110 moved from
# GRCh38.p13 to GRCh38.p14, which unmasked the pseudo-autosomal region. This
# causes ambiguous mappings to PAR locus genes.
human_fasta_url="http://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
human_fasta_in="${source}/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
human_gtf_url="http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.primary_assembly.annotation.gtf.gz"
human_gtf_in="${source}/gencode.v44.primary_assembly.annotation.gtf"
mouse_fasta_url="http://ftp.ensembl.org/pub/release-110/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.primary_assembly.fa.gz"
mouse_fasta_in="${source}/Mus_musculus.GRCm39.dna.primary_assembly.fa"
mouse_gtf_url="http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M33/gencode.vM33.primary_assembly.annotation.gtf.gz"
mouse_gtf_in="${source}/gencode.vM33.primary_assembly.annotation.gtf"

# String patterns used for both genomes
ID="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"

BIOTYPE_PATTERN=\
"(protein_coding|protein_coding_LoF|lncRNA|\
IG_C_gene|IG_D_gene|IG_J_gene|IG_LV_gene|IG_V_gene|\
IG_V_pseudogene|IG_J_pseudogene|IG_C_pseudogene|\
TR_C_gene|TR_D_gene|TR_J_gene|TR_V_gene|\
TR_V_pseudogene|TR_J_pseudogene)"
GENE_PATTERN="gene_type \"${BIOTYPE_PATTERN}\""
TX_PATTERN="transcript_type \"${BIOTYPE_PATTERN}\""
READTHROUGH_PATTERN="tag \"readthrough_transcript\""

# Download the gzip-compressed file at URL $1 and store it decompressed at
# path $2, unless $2 already exists.
fetch_gunzip() {
  local url=$1
  local dest=$2
  if [[ -f "$dest" ]]; then
    return 0
  fi
  # Stream into a temporary name and rename only on success; otherwise an
  # interrupted download would leave a truncated file that the existence
  # check above accepts on the next run. -f makes curl fail on HTTP errors.
  curl -fsSL "$url" | zcat > "${dest}.partial"
  mv -- "${dest}.partial" "$dest"
}

# Rewrite the FASTA headers of $1 into $2: keep only the contig name, prefix
# autosomes and X/Y with "chr", and rename MT to chrM.
normalize_fasta_headers() {
  sed -E \
    -e 's/^>(\S+).*/>\1 \1/' \
    -e 's/^>([0-9]+|[XY]) />chr\1 /' \
    -e 's/^>MT />chrM /' \
    "$1" > "$2"
}

# Copy GTF $1 to $2, splitting the version suffix of gene, transcript, and
# exon IDs into separate *_version attributes.
split_id_versions() {
  sed -E \
    -e 's/gene_id "'"$ID"'";/gene_id "\1"; gene_version "\3";/' \
    -e 's/transcript_id "'"$ID"'";/transcript_id "\1"; transcript_version "\3";/' \
    -e 's/exon_id "'"$ID"'";/exon_id "\1"; exon_version "\3";/' \
    "$1" > "$2"
}

# Write to $2 the unique gene_id attributes of all transcripts in GTF $1 that
# have an allowed gene and transcript biotype and no readthrough tag.
# With pipefail, a filter stage that matches nothing aborts the build.
write_gene_allowlist() {
  awk '$3 == "transcript"' "$1" \
    | grep -E "$GENE_PATTERN" \
    | grep -E "$TX_PATTERN" \
    | grep -Ev "$READTHROUGH_PATTERN" \
    | sed -E 's/.*(gene_id "[^"]+").*/\1/' \
    | LC_ALL=C sort -u \
    > "$2"
}

fetch_gunzip "$human_fasta_url" "$human_fasta_in"
fetch_gunzip "$human_gtf_url" "$human_gtf_in"
fetch_gunzip "$mouse_fasta_url" "$mouse_fasta_in"
fetch_gunzip "$mouse_gtf_url" "$mouse_gtf_in"

#################### HUMAN ####################

# Process FASTA -- translate chromosome names
human_fasta_modified="$build/$(basename "$human_fasta_in").modified"
normalize_fasta_headers "$human_fasta_in" "$human_fasta_modified"

# Process GTF -- split Ensembl IDs from version suffixes
human_gtf_modified="$build/$(basename "$human_gtf_in").modified"
split_id_versions "$human_gtf_in" "$human_gtf_modified"

# Process GTF -- filter based on gene/transcript tags
write_gene_allowlist "$human_gtf_modified" "${build}/gene_allowlist"

human_gtf_filtered="${build}/$(basename "$human_gtf_in").filtered"
# Header lines first (awk exits 0 even when there are none), then the
# allowlisted records; chrY records are kept only when their start lies in
# [2752083, 56887903) and the line does not mention ENSG00000290840.
awk '/^#/' "$human_gtf_modified" > "$human_gtf_filtered"
grep -Ff "${build}/gene_allowlist" "$human_gtf_modified" \
  | awk -F "\t" '$1 != "chrY" || $1 == "chrY" && $4 >= 2752083 && $4 < 56887903 && !/ENSG00000290840/' \
  >> "$human_gtf_filtered"

#################### MOUSE ####################
# Please see the GRCm39-2024-A build documentation for details on these steps.

# Process FASTA -- translate chromosome names
mouse_fasta_modified="$build/$(basename "$mouse_fasta_in").modified"
normalize_fasta_headers "$mouse_fasta_in" "$mouse_fasta_modified"

# Process GTF -- split Ensembl IDs from version suffixes
mouse_gtf_modified="$build/$(basename "$mouse_gtf_in").modified"
split_id_versions "$mouse_gtf_in" "$mouse_gtf_modified"

# Process GTF -- filter based on gene/transcript tags
# (the allowlist file is reused; the human one is no longer needed here)
write_gene_allowlist "$mouse_gtf_modified" "${build}/gene_allowlist"

mouse_gtf_filtered="${build}/$(basename "$mouse_gtf_in").filtered"
awk '/^#/' "$mouse_gtf_modified" > "$mouse_gtf_filtered"
grep -Ff "${build}/gene_allowlist" "$mouse_gtf_modified" \
  >> "$mouse_gtf_filtered"

#################### MKREF ####################

cellranger mkref --ref-version="$version" \
  --genome="$human_genome" --fasta="$human_fasta_modified" --genes="$human_gtf_filtered" \
  --genome="$mouse_genome" --fasta="$mouse_fasta_modified" --genes="$mouse_gtf_filtered" \
  --nthreads=16
#!/usr/bin/env bash
# Build the GRCh38 2020-A Cell Ranger reference package from the Ensembl
# release-98 primary-assembly FASTA and the GENCODE v32 annotation GTF.
#
# Strict mode: stop at the first failed command, unset variable, or failed
# pipeline stage, so that a broken download or an empty filter result can
# never flow silently into `cellranger mkref`.
set -euo pipefail

# Genome metadata
genome="GRCh38"
version="2020-A"

# Set up source and build directories
build="GRCh38-2020-A_build"
mkdir -p "$build"

# Download source files if they do not exist in reference_sources/ folder
source="reference_sources"
mkdir -p "$source"

fasta_url="http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
fasta_in="${source}/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
gtf_url="http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.primary_assembly.annotation.gtf.gz"
gtf_in="${source}/gencode.v32.primary_assembly.annotation.gtf"

# Download the gzip-compressed file at URL $1 and store it decompressed at
# path $2, unless $2 already exists.
fetch_gunzip() {
  local url=$1
  local dest=$2
  if [[ -f "$dest" ]]; then
    return 0
  fi
  # Stream into a temporary name and rename only on success; otherwise an
  # interrupted download would leave a truncated file that the existence
  # check above accepts on the next run. -f makes curl fail on HTTP errors.
  curl -fsSL "$url" | zcat > "${dest}.partial"
  mv -- "${dest}.partial" "$dest"
}

fetch_gunzip "$fasta_url" "$fasta_in"
fetch_gunzip "$gtf_url" "$gtf_in"

# Modify sequence headers in the Ensembl FASTA to match the file
# "GRCh38.primary_assembly.genome.fa" from GENCODE. Unplaced and unlocalized
# sequences such as "KI270728.1" have the same names in both versions.
#
# Input FASTA:
#   >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
#
# Output FASTA:
#   >chr1 1
fasta_modified="$build/$(basename "$fasta_in").modified"
# sed commands:
# 1. Replace metadata after space with original contig name, as in GENCODE
# 2. Add "chr" to names of autosomes and sex chromosomes
# 3. Handle the mitochondrial chromosome
sed -E \
  -e 's/^>(\S+).*/>\1 \1/' \
  -e 's/^>([0-9]+|[XY]) />chr\1 /' \
  -e 's/^>MT />chrM /' \
  "$fasta_in" > "$fasta_modified"

# Remove version suffix from transcript, gene, and exon IDs in order to match
# previous Cell Ranger reference packages
#
# Input GTF:
#     ... gene_id "ENSG00000223972.5"; ...
# Output GTF:
#     ... gene_id "ENSG00000223972"; gene_version "5"; ...
gtf_modified="$build/$(basename "$gtf_in").modified"
# Pattern matches Ensembl gene, transcript, and exon IDs for human or mouse:
ID="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
sed -E \
  -e 's/gene_id "'"$ID"'";/gene_id "\1"; gene_version "\3";/' \
  -e 's/transcript_id "'"$ID"'";/transcript_id "\1"; transcript_version "\3";/' \
  -e 's/exon_id "'"$ID"'";/exon_id "\1"; exon_version "\3";/' \
  "$gtf_in" > "$gtf_modified"

# Define string patterns for GTF tags
# NOTES:
# - Since GENCODE release 31/M22 (Ensembl 97), the "lincRNA" and "antisense"
#   biotypes are part of a more generic "lncRNA" biotype.
# - These filters are relevant only to GTF files from GENCODE. The GTFs from
#   Ensembl release 98 have the following differences:
#   - The names "gene_biotype" and "transcript_biotype" are used instead of
#     "gene_type" and "transcript_type".
#   - Readthrough transcripts are present but are not marked with the
#     "readthrough_transcript" tag.
#   - Only the X chromosome versions of genes in the pseudoautosomal regions
#     are present, so there is no "PAR" tag.
BIOTYPE_PATTERN=\
"(protein_coding|lncRNA|\
IG_C_gene|IG_D_gene|IG_J_gene|IG_LV_gene|IG_V_gene|\
IG_V_pseudogene|IG_J_pseudogene|IG_C_pseudogene|\
TR_C_gene|TR_D_gene|TR_J_gene|TR_V_gene|\
TR_V_pseudogene|TR_J_pseudogene)"
GENE_PATTERN="gene_type \"${BIOTYPE_PATTERN}\""
TX_PATTERN="transcript_type \"${BIOTYPE_PATTERN}\""
READTHROUGH_PATTERN="tag \"readthrough_transcript\""
PAR_PATTERN="tag \"PAR\""

# Construct the gene ID allowlist. We filter the list of all transcripts
# based on these criteria:
#   - allowable gene_type (biotype)
#   - allowable transcript_type (biotype)
#   - no "PAR" tag (only present for Y chromosome PAR)
#   - no "readthrough_transcript" tag
# We then collect the list of gene IDs that have at least one associated
# transcript passing the filters.
# With pipefail, a filter stage that matches nothing aborts the build here
# instead of producing an empty allowlist.
awk '$3 == "transcript"' "$gtf_modified" \
  | grep -E "$GENE_PATTERN" \
  | grep -E "$TX_PATTERN" \
  | grep -Ev "$READTHROUGH_PATTERN" \
  | grep -Ev "$PAR_PATTERN" \
  | sed -E 's/.*(gene_id "[^"]+").*/\1/' \
  | LC_ALL=C sort -u \
  > "${build}/gene_allowlist"

# Filter the GTF file based on the gene allowlist
gtf_filtered="${build}/$(basename "$gtf_in").filtered"
# Copy header lines beginning with "#". awk is used rather than grep so that
# a GTF without header lines does not trip `set -e`.
awk '/^#/' "$gtf_modified" > "$gtf_filtered"
# Filter to the gene allowlist
grep -Ff "${build}/gene_allowlist" "$gtf_modified" \
  >> "$gtf_filtered"

# Create reference package
cellranger mkref --ref-version="$version" \
  --genome="$genome" --fasta="$fasta_modified" --genes="$gtf_filtered"
#!/usr/bin/env bash
# Build the mm10 2020-A Cell Ranger reference package from the Ensembl
# release-98 GRCm38 primary-assembly FASTA and the GENCODE vM23 annotation GTF.
#
# Strict mode: stop at the first failed command, unset variable, or failed
# pipeline stage, so that a broken download or an empty filter result can
# never flow silently into `cellranger mkref`.
set -euo pipefail

# Genome metadata
genome="mm10"
version="2020-A"

# Set up source and build directories
build="mm10-2020-A_build"
mkdir -p "$build"

# Download source files if they do not exist in reference_sources/ folder
source="reference_sources"
mkdir -p "$source"

fasta_url="http://ftp.ensembl.org/pub/release-98/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz"
fasta_in="${source}/Mus_musculus.GRCm38.dna.primary_assembly.fa"
gtf_url="http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/gencode.vM23.primary_assembly.annotation.gtf.gz"
gtf_in="${source}/gencode.vM23.primary_assembly.annotation.gtf"

# Download the gzip-compressed file at URL $1 and store it decompressed at
# path $2, unless $2 already exists.
fetch_gunzip() {
  local url=$1
  local dest=$2
  if [[ -f "$dest" ]]; then
    return 0
  fi
  # Stream into a temporary name and rename only on success; otherwise an
  # interrupted download would leave a truncated file that the existence
  # check above accepts on the next run. -f makes curl fail on HTTP errors.
  curl -fsSL "$url" | zcat > "${dest}.partial"
  mv -- "${dest}.partial" "$dest"
}

fetch_gunzip "$fasta_url" "$fasta_in"
fetch_gunzip "$gtf_url" "$gtf_in"

# Modify sequence headers in the Ensembl FASTA to match the file
# "GRCm38.primary_assembly.genome.fa" from GENCODE. Unplaced and unlocalized
# sequences such as "GL456210.1" have the same names in both versions.
#
# Input FASTA:
#   >1 dna:chromosome chromosome:GRCm38:1:1:195471971:1 REF
#
# Output FASTA:
#   >chr1 1
fasta_modified="$build/$(basename "$fasta_in").modified"
# sed commands:
# 1. Replace metadata after space with original contig name, as in GENCODE
# 2. Add "chr" to names of autosomes and sex chromosomes
# 3. Handle the mitochondrial chromosome
sed -E \
  -e 's/^>(\S+).*/>\1 \1/' \
  -e 's/^>([0-9]+|[XY]) />chr\1 /' \
  -e 's/^>MT />chrM /' \
  "$fasta_in" > "$fasta_modified"

# Remove version suffix from transcript, gene, and exon IDs in order to match
# previous Cell Ranger reference packages
#
# Input GTF:
#     ... gene_id "ENSMUSG00000102693.1"; ...
# Output GTF:
#     ... gene_id "ENSMUSG00000102693"; gene_version "1"; ...
gtf_modified="$build/$(basename "$gtf_in").modified"
# Pattern matches Ensembl gene, transcript, and exon IDs for human or mouse:
ID="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
sed -E \
  -e 's/gene_id "'"$ID"'";/gene_id "\1"; gene_version "\3";/' \
  -e 's/transcript_id "'"$ID"'";/transcript_id "\1"; transcript_version "\3";/' \
  -e 's/exon_id "'"$ID"'";/exon_id "\1"; exon_version "\3";/' \
  "$gtf_in" > "$gtf_modified"

# Define string patterns for GTF tags
# NOTES:
# - Since GENCODE release 31/M22 (Ensembl 97), the "lincRNA" and "antisense"
#   biotypes are part of a more generic "lncRNA" biotype.
# - These filters are relevant only to GTF files from GENCODE. The GTFs from
#   Ensembl release 98 have the following differences:
#   - The names "gene_biotype" and "transcript_biotype" are used instead of
#     "gene_type" and "transcript_type".
#   - Readthrough transcripts are present but are not marked with the
#     "readthrough_transcript" tag.
BIOTYPE_PATTERN=\
"(protein_coding|lncRNA|\
IG_C_gene|IG_D_gene|IG_J_gene|IG_LV_gene|IG_V_gene|\
IG_V_pseudogene|IG_J_pseudogene|IG_C_pseudogene|\
TR_C_gene|TR_D_gene|TR_J_gene|TR_V_gene|\
TR_V_pseudogene|TR_J_pseudogene)"
GENE_PATTERN="gene_type \"${BIOTYPE_PATTERN}\""
TX_PATTERN="transcript_type \"${BIOTYPE_PATTERN}\""
READTHROUGH_PATTERN="tag \"readthrough_transcript\""

# Construct the gene ID allowlist. We filter the list of all transcripts
# based on these criteria:
#   - allowable gene_type (biotype)
#   - allowable transcript_type (biotype)
#   - no "readthrough_transcript" tag
# We then collect the list of gene IDs that have at least one associated
# transcript passing the filters.
# With pipefail, a filter stage that matches nothing aborts the build here
# instead of producing an empty allowlist.
awk '$3 == "transcript"' "$gtf_modified" \
  | grep -E "$GENE_PATTERN" \
  | grep -E "$TX_PATTERN" \
  | grep -Ev "$READTHROUGH_PATTERN" \
  | sed -E 's/.*(gene_id "[^"]+").*/\1/' \
  | LC_ALL=C sort -u \
  > "${build}/gene_allowlist"

# Filter the GTF file based on the gene allowlist
gtf_filtered="${build}/$(basename "$gtf_in").filtered"
# Copy header lines beginning with "#". awk is used rather than grep so that
# a GTF without header lines does not trip `set -e`.
awk '/^#/' "$gtf_modified" > "$gtf_filtered"
# Filter to the gene allowlist
grep -Ff "${build}/gene_allowlist" "$gtf_modified" \
  >> "$gtf_filtered"

# Create reference package
cellranger mkref --ref-version="$version" \
  --genome="$genome" --fasta="$fasta_modified" --genes="$gtf_filtered"
#!/usr/bin/env bash
# Build the combined human (GRCh38) and mouse (mm10) 2020-A Cell Ranger
# reference package. Both species share the FASTA and GTF processing steps;
# only the human allowlist additionally drops transcripts tagged "PAR".
#
# Strict mode: stop at the first failed command, unset variable, or failed
# pipeline stage, so that a broken download or an empty filter result can
# never flow silently into `cellranger mkref`.
set -euo pipefail

#################### SETUP ####################

human_genome="GRCh38"
mouse_genome="mm10"
version="2020-A"

build="GRCh38_and_mm10-2020-A_build"
mkdir -p "$build"

# Download source files if they do not exist in reference_sources/ folder
source="reference_sources"
mkdir -p "$source"

human_fasta_url="http://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
human_fasta_in="${source}/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
human_gtf_url="http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.primary_assembly.annotation.gtf.gz"
human_gtf_in="${source}/gencode.v32.primary_assembly.annotation.gtf"
mouse_fasta_url="http://ftp.ensembl.org/pub/release-98/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz"
mouse_fasta_in="${source}/Mus_musculus.GRCm38.dna.primary_assembly.fa"
mouse_gtf_url="http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/gencode.vM23.primary_assembly.annotation.gtf.gz"
mouse_gtf_in="${source}/gencode.vM23.primary_assembly.annotation.gtf"

# String patterns used for both genomes
ID="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"

BIOTYPE_PATTERN=\
"(protein_coding|lncRNA|\
IG_C_gene|IG_D_gene|IG_J_gene|IG_LV_gene|IG_V_gene|\
IG_V_pseudogene|IG_J_pseudogene|IG_C_pseudogene|\
TR_C_gene|TR_D_gene|TR_J_gene|TR_V_gene|\
TR_V_pseudogene|TR_J_pseudogene)"
GENE_PATTERN="gene_type \"${BIOTYPE_PATTERN}\""
TX_PATTERN="transcript_type \"${BIOTYPE_PATTERN}\""
READTHROUGH_PATTERN="tag \"readthrough_transcript\""
PAR_PATTERN="tag \"PAR\""

# Download the gzip-compressed file at URL $1 and store it decompressed at
# path $2, unless $2 already exists.
fetch_gunzip() {
  local url=$1
  local dest=$2
  if [[ -f "$dest" ]]; then
    return 0
  fi
  # Stream into a temporary name and rename only on success; otherwise an
  # interrupted download would leave a truncated file that the existence
  # check above accepts on the next run. -f makes curl fail on HTTP errors.
  curl -fsSL "$url" | zcat > "${dest}.partial"
  mv -- "${dest}.partial" "$dest"
}

# Rewrite the FASTA headers of $1 into $2: keep only the contig name, prefix
# autosomes and X/Y with "chr", and rename MT to chrM.
normalize_fasta_headers() {
  sed -E \
    -e 's/^>(\S+).*/>\1 \1/' \
    -e 's/^>([0-9]+|[XY]) />chr\1 /' \
    -e 's/^>MT />chrM /' \
    "$1" > "$2"
}

# Copy GTF $1 to $2, splitting the version suffix of gene, transcript, and
# exon IDs into separate *_version attributes.
split_id_versions() {
  sed -E \
    -e 's/gene_id "'"$ID"'";/gene_id "\1"; gene_version "\3";/' \
    -e 's/transcript_id "'"$ID"'";/transcript_id "\1"; transcript_version "\3";/' \
    -e 's/exon_id "'"$ID"'";/exon_id "\1"; exon_version "\3";/' \
    "$1" > "$2"
}

# Print the transcript records of GTF $1 that have an allowed gene and
# transcript biotype and no readthrough tag.
passing_transcripts() {
  awk '$3 == "transcript"' "$1" \
    | grep -E "$GENE_PATTERN" \
    | grep -E "$TX_PATTERN" \
    | grep -Ev "$READTHROUGH_PATTERN"
}

# Reduce GTF records on stdin to their unique gene_id attributes on stdout.
unique_gene_ids() {
  sed -E 's/.*(gene_id "[^"]+").*/\1/' | LC_ALL=C sort -u
}

fetch_gunzip "$human_fasta_url" "$human_fasta_in"
fetch_gunzip "$human_gtf_url" "$human_gtf_in"
fetch_gunzip "$mouse_fasta_url" "$mouse_fasta_in"
fetch_gunzip "$mouse_gtf_url" "$mouse_gtf_in"

#################### HUMAN ####################
# Please see the GRCh38-2020-A build documentation for details on these steps.

# Process FASTA -- translate chromosome names
human_fasta_modified="$build/$(basename "$human_fasta_in").modified"
normalize_fasta_headers "$human_fasta_in" "$human_fasta_modified"

# Process GTF -- split Ensembl IDs from version suffixes
human_gtf_modified="$build/$(basename "$human_gtf_in").modified"
split_id_versions "$human_gtf_in" "$human_gtf_modified"

# Process GTF -- filter based on gene/transcript tags
# (human only: also drop transcripts carrying the "PAR" tag)
passing_transcripts "$human_gtf_modified" \
  | grep -Ev "$PAR_PATTERN" \
  | unique_gene_ids \
  > "${build}/gene_allowlist"

human_gtf_filtered="${build}/$(basename "$human_gtf_in").filtered"
# Header lines first (awk exits 0 even when there are none), then the
# allowlisted records.
awk '/^#/' "$human_gtf_modified" > "$human_gtf_filtered"
grep -Ff "${build}/gene_allowlist" "$human_gtf_modified" \
  >> "$human_gtf_filtered"

#################### MOUSE ####################
# Please see the mm10-2020-A build documentation for details on these steps.

# Process FASTA -- translate chromosome names
mouse_fasta_modified="$build/$(basename "$mouse_fasta_in").modified"
normalize_fasta_headers "$mouse_fasta_in" "$mouse_fasta_modified"

# Process GTF -- split Ensembl IDs from version suffixes
mouse_gtf_modified="$build/$(basename "$mouse_gtf_in").modified"
split_id_versions "$mouse_gtf_in" "$mouse_gtf_modified"

# Process GTF -- filter based on gene/transcript tags
# (the allowlist file is reused; the human one is no longer needed here)
passing_transcripts "$mouse_gtf_modified" \
  | unique_gene_ids \
  > "${build}/gene_allowlist"

mouse_gtf_filtered="${build}/$(basename "$mouse_gtf_in").filtered"
awk '/^#/' "$mouse_gtf_modified" > "$mouse_gtf_filtered"
grep -Ff "${build}/gene_allowlist" "$mouse_gtf_modified" \
  >> "$mouse_gtf_filtered"

#################### MKREF ####################

cellranger mkref --ref-version="$version" \
  --genome="$human_genome" --fasta="$human_fasta_modified" --genes="$human_gtf_filtered" \
  --genome="$mouse_genome" --fasta="$mouse_fasta_modified" --genes="$mouse_gtf_filtered"
#!/usr/bin/env bash
# Build the combined GRCh38 + mm10 reference package (ref-version 3.1.0)
# from the Ensembl release-93 FASTA and GTF files for human and mouse.
#
# Strict mode: a failed download, gunzip, or mkgtf stops the build instead
# of letting later steps run against missing or partial files.
set -euo pipefail

# Download the .gz file at URL $1 into the current directory and unpack it,
# unless the unpacked file is already present.
fetch_gunzip() {
  local url=$1
  local archive=${url##*/}
  local unpacked=${archive%.gz}
  if [[ -f "$unpacked" ]]; then
    return 0
  fi
  # -O overwrites a stale archive left by an earlier interrupted download.
  wget -O "$archive" "$url"
  gunzip "$archive"
}

fetch_gunzip ftp://ftp.ensembl.org/pub/release-93/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
fetch_gunzip ftp://ftp.ensembl.org/pub/release-93/gtf/homo_sapiens/Homo_sapiens.GRCh38.93.gtf.gz

fetch_gunzip ftp://ftp.ensembl.org/pub/release-93/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
fetch_gunzip ftp://ftp.ensembl.org/pub/release-93/gtf/mus_musculus/Mus_musculus.GRCm38.93.gtf.gz

# Gene biotypes kept by `cellranger mkgtf`; the same list is applied to both
# species.
biotype_attrs=(
  --attribute=gene_biotype:protein_coding
  --attribute=gene_biotype:lincRNA
  --attribute=gene_biotype:antisense
  --attribute=gene_biotype:IG_LV_gene
  --attribute=gene_biotype:IG_V_gene
  --attribute=gene_biotype:IG_V_pseudogene
  --attribute=gene_biotype:IG_D_gene
  --attribute=gene_biotype:IG_J_gene
  --attribute=gene_biotype:IG_J_pseudogene
  --attribute=gene_biotype:IG_C_gene
  --attribute=gene_biotype:IG_C_pseudogene
  --attribute=gene_biotype:TR_V_gene
  --attribute=gene_biotype:TR_V_pseudogene
  --attribute=gene_biotype:TR_D_gene
  --attribute=gene_biotype:TR_J_gene
  --attribute=gene_biotype:TR_J_pseudogene
  --attribute=gene_biotype:TR_C_gene
)

cellranger mkgtf Homo_sapiens.GRCh38.93.gtf Homo_sapiens.GRCh38.93.filtered.gtf \
  "${biotype_attrs[@]}"

cellranger mkgtf Mus_musculus.GRCm38.93.gtf Mus_musculus.GRCm38.93.filtered.gtf \
  "${biotype_attrs[@]}"

cellranger mkref --genome=GRCh38 \
  --fasta=Homo_sapiens.GRCh38.dna.primary_assembly.fa \
  --genes=Homo_sapiens.GRCh38.93.filtered.gtf \
  --genome=mm10 \
  --fasta=Mus_musculus.GRCm38.dna.primary_assembly.fa \
  --genes=Mus_musculus.GRCm38.93.filtered.gtf \
  --ref-version=3.1.0
#!/usr/bin/env bash
# Build the GRCh38 reference package (ref-version 3.0.0) from the Ensembl
# release-93 human FASTA and GTF files.
#
# Strict mode: a failed download, gunzip, or mkgtf stops the build instead
# of letting later steps run against missing or partial files.
set -euo pipefail

# Download the .gz file at URL $1 into the current directory and unpack it,
# unless the unpacked file is already present.
fetch_gunzip() {
  local url=$1
  local archive=${url##*/}
  local unpacked=${archive%.gz}
  if [[ -f "$unpacked" ]]; then
    return 0
  fi
  # -O overwrites a stale archive left by an earlier interrupted download.
  wget -O "$archive" "$url"
  gunzip "$archive"
}

fetch_gunzip ftp://ftp.ensembl.org/pub/release-93/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
fetch_gunzip ftp://ftp.ensembl.org/pub/release-93/gtf/homo_sapiens/Homo_sapiens.GRCh38.93.gtf.gz

# Keep only records whose gene_biotype is in the list below.
cellranger mkgtf Homo_sapiens.GRCh38.93.gtf Homo_sapiens.GRCh38.93.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense \
  --attribute=gene_biotype:IG_LV_gene \
  --attribute=gene_biotype:IG_V_gene \
  --attribute=gene_biotype:IG_V_pseudogene \
  --attribute=gene_biotype:IG_D_gene \
  --attribute=gene_biotype:IG_J_gene \
  --attribute=gene_biotype:IG_J_pseudogene \
  --attribute=gene_biotype:IG_C_gene \
  --attribute=gene_biotype:IG_C_pseudogene \
  --attribute=gene_biotype:TR_V_gene \
  --attribute=gene_biotype:TR_V_pseudogene \
  --attribute=gene_biotype:TR_D_gene \
  --attribute=gene_biotype:TR_J_gene \
  --attribute=gene_biotype:TR_J_pseudogene \
  --attribute=gene_biotype:TR_C_gene

cellranger mkref --genome=GRCh38 \
  --fasta=Homo_sapiens.GRCh38.dna.primary_assembly.fa \
  --genes=Homo_sapiens.GRCh38.93.filtered.gtf \
  --ref-version=3.0.0
#!/usr/bin/env bash
# Build the hg19 reference package (ref-version 3.0.0) from the Ensembl
# GRCh37 release-87 FASTA and GTF files.
#
# Strict mode: a failed download, gunzip, or mkgtf stops the build instead
# of letting later steps run against missing or partial files.
set -euo pipefail

# Download the .gz file at URL $1 into the current directory and unpack it,
# unless the unpacked file is already present.
fetch_gunzip() {
  local url=$1
  local archive=${url##*/}
  local unpacked=${archive%.gz}
  if [[ -f "$unpacked" ]]; then
    return 0
  fi
  # -O overwrites a stale archive left by an earlier interrupted download.
  wget -O "$archive" "$url"
  gunzip "$archive"
}

fetch_gunzip ftp://ftp.ensembl.org/pub/grch37/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz
fetch_gunzip ftp://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz

# This build keeps only three gene biotypes (no IG_*/TR_* entries).
cellranger mkgtf Homo_sapiens.GRCh37.87.gtf Homo_sapiens.GRCh37.87.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense

cellranger mkref --genome=hg19 \
  --fasta=Homo_sapiens.GRCh37.dna.primary_assembly.fa \
  --genes=Homo_sapiens.GRCh37.87.filtered.gtf \
  --ref-version=3.0.0
#!/usr/bin/env bash
# Build the mm10 reference package (ref-version 3.0.0) from the Ensembl
# release-93 GRCm38 FASTA and GTF files.
#
# Strict mode: a failed download, gunzip, or mkgtf stops the build instead
# of letting later steps run against missing or partial files.
set -euo pipefail

# Download the .gz file at URL $1 into the current directory and unpack it,
# unless the unpacked file is already present.
fetch_gunzip() {
  local url=$1
  local archive=${url##*/}
  local unpacked=${archive%.gz}
  if [[ -f "$unpacked" ]]; then
    return 0
  fi
  # -O overwrites a stale archive left by an earlier interrupted download.
  wget -O "$archive" "$url"
  gunzip "$archive"
}

fetch_gunzip ftp://ftp.ensembl.org/pub/release-93/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
fetch_gunzip ftp://ftp.ensembl.org/pub/release-93/gtf/mus_musculus/Mus_musculus.GRCm38.93.gtf.gz

# Keep only records whose gene_biotype is in the list below.
cellranger mkgtf Mus_musculus.GRCm38.93.gtf Mus_musculus.GRCm38.93.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense \
  --attribute=gene_biotype:IG_LV_gene \
  --attribute=gene_biotype:IG_V_gene \
  --attribute=gene_biotype:IG_V_pseudogene \
  --attribute=gene_biotype:IG_D_gene \
  --attribute=gene_biotype:IG_J_gene \
  --attribute=gene_biotype:IG_J_pseudogene \
  --attribute=gene_biotype:IG_C_gene \
  --attribute=gene_biotype:IG_C_pseudogene \
  --attribute=gene_biotype:TR_V_gene \
  --attribute=gene_biotype:TR_V_pseudogene \
  --attribute=gene_biotype:TR_D_gene \
  --attribute=gene_biotype:TR_J_gene \
  --attribute=gene_biotype:TR_J_pseudogene \
  --attribute=gene_biotype:TR_C_gene

cellranger mkref --genome=mm10 \
  --fasta=Mus_musculus.GRCm38.dna.primary_assembly.fa \
  --genes=Mus_musculus.GRCm38.93.filtered.gtf \
  --ref-version=3.0.0
# Build steps for the combined hg19 + mm10 reference (--ref-version=3.0.0).
# Inputs: Ensembl GRCh37 release-87 (human) and release-93 (mouse)
# primary-assembly FASTA and GTF files.
# Each command is on its own line; a trailing backslash continues a command.

# Download and decompress the human FASTA and GTF.
wget ftp://ftp.ensembl.org/pub/grch37/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz
gunzip Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz
gunzip Homo_sapiens.GRCh37.87.gtf.gz

# Download and decompress the mouse FASTA and GTF.
wget ftp://ftp.ensembl.org/pub/release-93/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
gunzip Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-93/gtf/mus_musculus/Mus_musculus.GRCm38.93.gtf.gz
gunzip Mus_musculus.GRCm38.93.gtf.gz

# Write the filtered human GTF, one --attribute per gene_biotype to keep.
cellranger mkgtf Homo_sapiens.GRCh37.87.gtf Homo_sapiens.GRCh37.87.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense \
  --attribute=gene_biotype:IG_LV_gene \
  --attribute=gene_biotype:IG_V_gene \
  --attribute=gene_biotype:IG_V_pseudogene \
  --attribute=gene_biotype:IG_D_gene \
  --attribute=gene_biotype:IG_J_gene \
  --attribute=gene_biotype:IG_J_pseudogene \
  --attribute=gene_biotype:IG_C_gene \
  --attribute=gene_biotype:IG_C_pseudogene \
  --attribute=gene_biotype:TR_V_gene \
  --attribute=gene_biotype:TR_V_pseudogene \
  --attribute=gene_biotype:TR_D_gene \
  --attribute=gene_biotype:TR_J_gene \
  --attribute=gene_biotype:TR_J_pseudogene \
  --attribute=gene_biotype:TR_C_gene

# Write the filtered mouse GTF with the same gene_biotype list.
cellranger mkgtf Mus_musculus.GRCm38.93.gtf Mus_musculus.GRCm38.93.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense \
  --attribute=gene_biotype:IG_LV_gene \
  --attribute=gene_biotype:IG_V_gene \
  --attribute=gene_biotype:IG_V_pseudogene \
  --attribute=gene_biotype:IG_D_gene \
  --attribute=gene_biotype:IG_J_gene \
  --attribute=gene_biotype:IG_J_pseudogene \
  --attribute=gene_biotype:IG_C_gene \
  --attribute=gene_biotype:IG_C_pseudogene \
  --attribute=gene_biotype:TR_V_gene \
  --attribute=gene_biotype:TR_V_pseudogene \
  --attribute=gene_biotype:TR_D_gene \
  --attribute=gene_biotype:TR_J_gene \
  --attribute=gene_biotype:TR_J_pseudogene \
  --attribute=gene_biotype:TR_C_gene

# Build the combined reference. The options come as two
# --genome/--fasta/--genes groups: hg19 first, then mm10.
cellranger mkref --genome=hg19 \
  --fasta=Homo_sapiens.GRCh37.dna.primary_assembly.fa \
  --genes=Homo_sapiens.GRCh37.87.filtered.gtf \
  --genome=mm10 \
  --fasta=Mus_musculus.GRCm38.dna.primary_assembly.fa \
  --genes=Mus_musculus.GRCm38.93.filtered.gtf \
  --ref-version=3.0.0
# Build steps for the mm10 reference (--ref-version=2.1.0).
# Inputs: Ensembl release-84 mouse primary-assembly FASTA and GTF.
# Each command is on its own line; a trailing backslash continues a command.

# Download and decompress the genome FASTA and the gene annotation GTF.
wget ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
gunzip Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-84/gtf/mus_musculus/Mus_musculus.GRCm38.84.gtf.gz
gunzip Mus_musculus.GRCm38.84.gtf.gz

# Write the filtered GTF, passing one --attribute per gene_biotype to keep.
cellranger mkgtf Mus_musculus.GRCm38.84.gtf Mus_musculus.GRCm38.84.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense \
  --attribute=gene_biotype:IG_LV_gene \
  --attribute=gene_biotype:IG_V_gene \
  --attribute=gene_biotype:IG_V_pseudogene \
  --attribute=gene_biotype:IG_D_gene \
  --attribute=gene_biotype:IG_J_gene \
  --attribute=gene_biotype:IG_J_pseudogene \
  --attribute=gene_biotype:IG_C_gene \
  --attribute=gene_biotype:IG_C_pseudogene \
  --attribute=gene_biotype:TR_V_gene \
  --attribute=gene_biotype:TR_V_pseudogene \
  --attribute=gene_biotype:TR_D_gene \
  --attribute=gene_biotype:TR_J_gene \
  --attribute=gene_biotype:TR_J_pseudogene \
  --attribute=gene_biotype:TR_C_gene

# Build the reference package from the FASTA and the filtered GTF.
cellranger mkref --genome=mm10 \
  --fasta=Mus_musculus.GRCm38.dna.primary_assembly.fa \
  --genes=Mus_musculus.GRCm38.84.filtered.gtf \
  --ref-version=2.1.0
# Build steps for the combined hg19 + mm10 reference (--ref-version=2.1.0).
# Inputs: Ensembl GRCh37 release-84 (human) and release-84 (mouse)
# primary-assembly FASTA and GTF files.
# Each command is on its own line; a trailing backslash continues a command.

# Download and decompress the human FASTA and GTF.
# The GTF in the grch37/release-84 directory carries "82" in its file name;
# the name is kept exactly as in the original.
wget ftp://ftp.ensembl.org/pub/grch37/release-84/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz
gunzip Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/grch37/release-84/gtf/homo_sapiens/Homo_sapiens.GRCh37.82.gtf.gz
gunzip Homo_sapiens.GRCh37.82.gtf.gz

# Download and decompress the mouse FASTA and GTF.
wget ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
gunzip Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-84/gtf/mus_musculus/Mus_musculus.GRCm38.84.gtf.gz
gunzip Mus_musculus.GRCm38.84.gtf.gz

# Write the filtered human GTF, one --attribute per gene_biotype to keep.
cellranger mkgtf Homo_sapiens.GRCh37.82.gtf Homo_sapiens.GRCh37.82.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense \
  --attribute=gene_biotype:IG_LV_gene \
  --attribute=gene_biotype:IG_V_gene \
  --attribute=gene_biotype:IG_V_pseudogene \
  --attribute=gene_biotype:IG_D_gene \
  --attribute=gene_biotype:IG_J_gene \
  --attribute=gene_biotype:IG_J_pseudogene \
  --attribute=gene_biotype:IG_C_gene \
  --attribute=gene_biotype:IG_C_pseudogene \
  --attribute=gene_biotype:TR_V_gene \
  --attribute=gene_biotype:TR_V_pseudogene \
  --attribute=gene_biotype:TR_D_gene \
  --attribute=gene_biotype:TR_J_gene \
  --attribute=gene_biotype:TR_J_pseudogene \
  --attribute=gene_biotype:TR_C_gene

# Write the filtered mouse GTF with the same gene_biotype list.
cellranger mkgtf Mus_musculus.GRCm38.84.gtf Mus_musculus.GRCm38.84.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense \
  --attribute=gene_biotype:IG_LV_gene \
  --attribute=gene_biotype:IG_V_gene \
  --attribute=gene_biotype:IG_V_pseudogene \
  --attribute=gene_biotype:IG_D_gene \
  --attribute=gene_biotype:IG_J_gene \
  --attribute=gene_biotype:IG_J_pseudogene \
  --attribute=gene_biotype:IG_C_gene \
  --attribute=gene_biotype:IG_C_pseudogene \
  --attribute=gene_biotype:TR_V_gene \
  --attribute=gene_biotype:TR_V_pseudogene \
  --attribute=gene_biotype:TR_D_gene \
  --attribute=gene_biotype:TR_J_gene \
  --attribute=gene_biotype:TR_J_pseudogene \
  --attribute=gene_biotype:TR_C_gene

# Build the combined reference. The options come as two
# --genome/--fasta/--genes groups: hg19 first, then mm10.
cellranger mkref --genome=hg19 \
  --fasta=Homo_sapiens.GRCh37.dna.primary_assembly.fa \
  --genes=Homo_sapiens.GRCh37.82.filtered.gtf \
  --genome=mm10 \
  --fasta=Mus_musculus.GRCm38.dna.primary_assembly.fa \
  --genes=Mus_musculus.GRCm38.84.filtered.gtf \
  --ref-version=2.1.0
# Build steps for the GRCh38 reference (--ref-version=1.2.0).
# Inputs: Ensembl release-84 human primary-assembly FASTA and GTF.
# Each command is on its own line; a trailing backslash continues a command.

# Download and decompress the genome FASTA and the gene annotation GTF.
wget ftp://ftp.ensembl.org/pub/release-84/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
gunzip Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-84/gtf/homo_sapiens/Homo_sapiens.GRCh38.84.gtf.gz
gunzip Homo_sapiens.GRCh38.84.gtf.gz

# Write the filtered GTF, passing one --attribute per gene_biotype to keep.
cellranger mkgtf Homo_sapiens.GRCh38.84.gtf Homo_sapiens.GRCh38.84.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense \
  --attribute=gene_biotype:IG_LV_gene \
  --attribute=gene_biotype:IG_V_gene \
  --attribute=gene_biotype:IG_V_pseudogene \
  --attribute=gene_biotype:IG_D_gene \
  --attribute=gene_biotype:IG_J_gene \
  --attribute=gene_biotype:IG_J_pseudogene \
  --attribute=gene_biotype:IG_C_gene \
  --attribute=gene_biotype:IG_C_pseudogene \
  --attribute=gene_biotype:TR_V_gene \
  --attribute=gene_biotype:TR_V_pseudogene \
  --attribute=gene_biotype:TR_D_gene \
  --attribute=gene_biotype:TR_J_gene \
  --attribute=gene_biotype:TR_J_pseudogene \
  --attribute=gene_biotype:TR_C_gene

# Build the reference package from the FASTA and the filtered GTF.
cellranger mkref --genome=GRCh38 \
  --fasta=Homo_sapiens.GRCh38.dna.primary_assembly.fa \
  --genes=Homo_sapiens.GRCh38.84.filtered.gtf \
  --ref-version=1.2.0
# Build steps for the hg19 reference (--ref-version=1.2.0).
# Inputs: Ensembl GRCh37 release-84 human primary-assembly FASTA and GTF.
# Each command is on its own line; a trailing backslash continues a command.

# Download and decompress the genome FASTA and the gene annotation GTF.
# The GTF in the grch37/release-84 directory carries "82" in its file name;
# the name is kept exactly as in the original.
wget ftp://ftp.ensembl.org/pub/grch37/release-84/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz
gunzip Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/grch37/release-84/gtf/homo_sapiens/Homo_sapiens.GRCh37.82.gtf.gz
gunzip Homo_sapiens.GRCh37.82.gtf.gz

# Write the filtered GTF; this recipe lists only three gene_biotype values.
cellranger mkgtf Homo_sapiens.GRCh37.82.gtf Homo_sapiens.GRCh37.82.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense

# Build the reference package from the FASTA and the filtered GTF.
cellranger mkref --genome=hg19 \
  --fasta=Homo_sapiens.GRCh37.dna.primary_assembly.fa \
  --genes=Homo_sapiens.GRCh37.82.filtered.gtf \
  --ref-version=1.2.0
# Build steps for the mm10 reference (--ref-version=1.2.0).
# Inputs: Ensembl release-84 mouse primary-assembly FASTA and GTF.
# Each command is on its own line; a trailing backslash continues a command.

# Download and decompress the genome FASTA and the gene annotation GTF.
wget ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
gunzip Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-84/gtf/mus_musculus/Mus_musculus.GRCm38.84.gtf.gz
gunzip Mus_musculus.GRCm38.84.gtf.gz

# Write the filtered GTF; this recipe lists only three gene_biotype values.
cellranger mkgtf Mus_musculus.GRCm38.84.gtf Mus_musculus.GRCm38.84.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense

# Build the reference package from the FASTA and the filtered GTF.
cellranger mkref --genome=mm10 \
  --fasta=Mus_musculus.GRCm38.dna.primary_assembly.fa \
  --genes=Mus_musculus.GRCm38.84.filtered.gtf \
  --ref-version=1.2.0
# Build steps for the combined hg19 + mm10 reference.
# Inputs: Ensembl GRCh37 release-84 (human) and release-84 (mouse)
# primary-assembly FASTA and GTF files.
# Each command is on its own line; a trailing backslash continues a command.

# Download and decompress the human FASTA and GTF.
# The GTF in the grch37/release-84 directory carries "82" in its file name;
# the name is kept exactly as in the original.
wget ftp://ftp.ensembl.org/pub/grch37/release-84/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz
gunzip Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/grch37/release-84/gtf/homo_sapiens/Homo_sapiens.GRCh37.82.gtf.gz
gunzip Homo_sapiens.GRCh37.82.gtf.gz

# Download and decompress the mouse FASTA and GTF.
wget ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
gunzip Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-84/gtf/mus_musculus/Mus_musculus.GRCm38.84.gtf.gz
gunzip Mus_musculus.GRCm38.84.gtf.gz

# Write the filtered human GTF; this recipe lists three gene_biotype values.
cellranger mkgtf Homo_sapiens.GRCh37.82.gtf Homo_sapiens.GRCh37.82.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense

# Write the filtered mouse GTF with the same three gene_biotype values.
cellranger mkgtf Mus_musculus.GRCm38.84.gtf Mus_musculus.GRCm38.84.filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lincRNA \
  --attribute=gene_biotype:antisense

# Build the combined reference. The options come as two
# --genome/--fasta/--genes groups: hg19 first, then mm10.
# NOTE(review): the original text was cut off after
# "--genes=Mus_musculus.GRCm38.84". The ".filtered.gtf" suffix is restored to
# match the mkgtf output written above. "--ref-version=1.2.0" is restored on
# the assumption that this recipe belongs with the neighbouring 1.2.0 builds;
# confirm against the published build notes.
cellranger mkref --genome=hg19 \
  --fasta=Homo_sapiens.GRCh37.dna.primary_assembly.fa \
  --genes=Homo_sapiens.GRCh37.82.filtered.gtf \
  --genome=mm10 \
  --fasta=Mus_musculus.GRCm38.dna.primary_assembly.fa \
  --genes=Mus_musculus.GRCm38.84.filtered.gtf \
  --ref-version=1.2.0
# Build step for the ercc92 reference (--ref-version=1.2.0).
# ercc92.fa and ercc92.gtf are read from the current directory; this snippet
# does not download or generate them, so they must already be present.
# A trailing backslash continues the command on the next line.
cellranger mkref --genome=ercc92 \
  --fasta=ercc92.fa \
  --genes=ercc92.gtf \
  --ref-version=1.2.0