Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ private CellBaseBuilder buildConservation() {

private CellBaseBuilder buildClinicalVariants() {
Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER);
copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("clinvarVersion.json")));
copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve(CLINVAR_VERSION_FILENAME)));
copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("gwasVersion.json")));

CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ private void loadClinical() throws FileNotFoundException {

// Update release (collection and sources)
List<Path> sources = new ArrayList<>(Arrays.asList(
input.resolve("clinvarVersion.json"),
input.resolve(CLINVAR_VERSION_FILENAME),
input.resolve("cosmicVersion.json"),
input.resolve("gwasVersion.json")
));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,18 @@ public class EtlCommons {
public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json";

public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
public static final String CLINVAR_VERSION = "2024-05";
public static final String CLINVAR_DATE = "2024-05";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease.xml.gz";
public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv";
public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz";
public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz";
public static final String CLINVAR_VERSION_FILENAME = "clinvarVersion.json";

public static final String IARCTP53_FILE = "IARC-TP53.zip";
public static final String GWAS_FILE = "gwas_catalog.tsv";
public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz";
public static final String COSMIC_VERSION_FILENAME = "cosmicVersion.json";
public static final String HGMD_VERSION_FILENAME = "hgmdVersion.json";

@Deprecated
public static final String DBSNP_FILE = "GCF_000001405.40.gz";
public static final String DBSNP_NAME = "dbSNP";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@
package org.opencb.cellbase.lib.builders.clinical.variant;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.formats.variant.clinvar.rcv.ClinvarParser;
import org.opencb.biodata.formats.variant.clinvar.rcv.v64jaxb.*;
import org.opencb.biodata.models.sequence.SequenceLocation;
import org.opencb.biodata.models.variant.avro.*;
import org.opencb.cellbase.core.models.DataReleaseSource;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.cellbase.lib.variant.VariantAnnotationUtils;
import org.opencb.commons.ProgressLogger;
Expand All @@ -42,8 +45,7 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_DATE;
import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION;
import static org.opencb.cellbase.lib.EtlCommons.*;

//import org.opencb.biodata.formats.variant.clinvar.v24jaxb.*;

Expand Down Expand Up @@ -84,6 +86,10 @@ public class ClinVarIndexer extends ClinicalIndexer {
private final Path clinvarVariationAlleleFile;
private final Path clinvarEFOFile;
private final String assembly;

private String version;
private String date;

private int numberSomaticRecords = 0;
private int numberGermlineRecords = 0;
private int numberNoDiseaseTrait = 0;
Expand All @@ -99,18 +105,32 @@ public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinva
Path clinvarEFOFile, boolean normalize, Path genomeSequenceFilePath, String assembly,
RocksDB rdb) throws IOException {
super(genomeSequenceFilePath);
this.rdb = rdb;

this.clinvarXMLFiles = clinvarXMLFiles;
this.clinvarSummaryFile = clinvarSummaryFile;
this.clinvarVariationAlleleFile = clinvarVariationAlleleFile;
this.clinvarEFOFile = clinvarEFOFile;
this.normalize = normalize;
this.genomeSequenceFilePath = genomeSequenceFilePath;
this.assembly = assembly;

this.rdb = rdb;
}

public void index() throws RocksDBException {
try {
Path clinvarVersionPath = clinvarSummaryFile.getParent().resolve(CLINVAR_VERSION_FILENAME);
if (!Files.exists(clinvarVersionPath)) {
throw new IOException("ClinVar version file " + clinvarVersionPath + " does not exist");
}
ObjectMapper jsonObjectMapper = new ObjectMapper();
ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataReleaseSource.class);
DataReleaseSource dataReleaseSource = jsonObjectReader.readValue(clinvarVersionPath.toFile());

this.date = dataReleaseSource.getDate();
this.version = dataReleaseSource.getVersion();


Map<String, EFO> traitsToEfoTermsMap = loadEFOTerms();
Map<String, List<AlleleLocationData>> rcvToAlleleLocationData = parseVariantSummary(traitsToEfoTermsMap);

Expand Down Expand Up @@ -157,15 +177,9 @@ public boolean accept(File dir, String name) {
}
logger.info("Done");
printSummary();
} catch (RocksDBException e) {
logger.error("Error reading/writing from/to the RocksDB index while indexing ClinVar");
throw e;
} catch (JAXBException e) {
logger.error("Error unmarshalling clinvar Xml file: " + e.getMessage());
e.printStackTrace();
} catch (IOException e) {
logger.error("Error indexing clinvar Xml file: " + e.getMessage());
e.printStackTrace();
} catch (RocksDBException | JAXBException | IOException e) {
logger.error("Error indexing ClinVar", e);
throw new RocksDBException(e.getMessage());
}
}

Expand Down Expand Up @@ -332,7 +346,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, String variation
String mateVariantString, String clinicalHaplotypeString,
Map<String, EFO> traitsToEfoTermsMap) {

EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, date);
// Create a set to avoid situations like germline;germline;germline
List<AlleleOrigin> alleleOrigin = null;
if (!EtlCommons.isMissing(lineFields[VARIANT_SUMMARY_ORIGIN_COLUMN])) {
Expand Down Expand Up @@ -413,7 +427,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu
throws JsonProcessingException {

List<Property> additionalProperties = new ArrayList<>(3);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, date);
// String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();

VariantClassification variantClassification = getVariantClassification(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.cellbase.lib.builders.CellBaseBuilder;
import org.opencb.commons.utils.FileUtils;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;

import java.io.File;
import java.io.IOException;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
Expand Down Expand Up @@ -125,8 +125,17 @@ public void parse() throws IOException, RocksDBException, CellBaseException {
if (this.clinvarXMLFile != null && this.clinvarSummaryFile != null
&& this.clinvarVariationAlleleFile != null && Files.exists(clinvarXMLFile)
&& Files.exists(clinvarSummaryFile) && Files.exists(clinvarVariationAlleleFile)) {
ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile,
clinvarVariationAlleleFile, clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb);

Path chunksPaths = clinvarXMLFile.getParent().resolve("clinvar_chunks");
if (Files.notExists(chunksPaths)) {
logger.info("Splitting ClinVar XML file in multiple ClinVar chunk files at {} ...", chunksPaths);
Files.createDirectories(chunksPaths);
splitClinvar(this.clinvarXMLFile, chunksPaths);
logger.info("Done");
}

ClinVarIndexer clinvarIndexer = new ClinVarIndexer(chunksPaths, clinvarSummaryFile, clinvarVariationAlleleFile,
clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb);
clinvarIndexer.index();
} else {
logger.warn("One or more of required ClinVar files are missing. Skipping ClinVar data.\n"
Expand Down Expand Up @@ -190,6 +199,48 @@ public void parse() throws IOException, RocksDBException, CellBaseException {

}

/**
 * Splits a ClinVar XML release file into smaller chunk files holding at most 10000 ClinVarSet entries each.
 * <p>
 * Every line read before the first line starting with {@code <ClinVarSet } is treated as the document header
 * and is copied (trimmed) to the top of each chunk; each chunk is terminated with {@code </ReleaseSet>}.
 * Chunks are written as {@code chunk_<n>.xml} inside the output folder, numbered from 0. If the input holds
 * no ClinVarSet entry, no chunk file is written.
 *
 * @param clinvarXmlFilePath ClinVar XML file to split
 * @param splitOutdirPath    folder where the chunk files are created; it is not created by this method
 * @throws IOException if the input cannot be read, or a chunk file cannot be created or fully written
 */
private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException {
    final int entriesPerChunk = 10000;

    StringBuilder header = new StringBuilder();
    boolean beforeEntry = true;
    boolean inEntry = false;
    int count = 0;
    int chunk = 0;

    // The current chunk writer is rotated inside the loop, so it cannot be a try-with-resources variable;
    // the finally clause guarantees it is released if anything fails half-way through a chunk.
    PrintWriter pw = null;
    Path chunkFile = null;
    try (BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath)) {
        String line;
        while ((line = br.readLine()) != null) {
            String trimmedLine = line.trim();

            if (trimmedLine.startsWith("<ClinVarSet ")) {
                inEntry = true;
                beforeEntry = false;
                if (count % entriesPerChunk == 0) {
                    // First entry of a new chunk: open the file and replicate the XML header
                    chunkFile = splitOutdirPath.resolve("chunk_" + chunk + ".xml");
                    pw = new PrintWriter(new FileOutputStream(chunkFile.toFile()));
                    pw.println(header.toString().trim());
                }
                count++;
            }

            if (beforeEntry) {
                header.append(line).append("\n");
            }

            if (inEntry) {
                pw.println(line);
            }

            if (trimmedLine.startsWith("</ClinVarSet>")) {
                inEntry = false;
                // The null check protects against a closing tag seen while no chunk is open
                if (pw != null && count % entriesPerChunk == 0) {
                    finishClinvarChunk(pw, chunkFile);
                    pw = null;
                    chunk++;
                }
            }
        }

        // Terminate the last, partially filled chunk; nothing to do if the last chunk was exactly full
        // or the input had no entries at all
        if (pw != null) {
            finishClinvarChunk(pw, chunkFile);
            pw = null;
        }
    } finally {
        if (pw != null) {
            pw.close();
        }
    }
}

/**
 * Writes the closing {@code </ReleaseSet>} tag, closes the chunk writer and reports any write error that
 * PrintWriter recorded silently.
 *
 * @param chunkWriter writer of the chunk being finished
 * @param chunkFile   path of the chunk file, used only for the error message
 * @throws IOException if any write to the chunk file failed
 */
private void finishClinvarChunk(PrintWriter chunkWriter, Path chunkFile) throws IOException {
    chunkWriter.print("</ReleaseSet>");
    chunkWriter.close();
    if (chunkWriter.checkError()) {
        throw new IOException("Error writing ClinVar chunk file " + chunkFile);
    }
}

private void serializeRDB(RocksDB rdb) throws IOException {
// DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's
// named "iterator"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,37 @@

package org.opencb.cellbase.lib.builders.clinical.variant;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.MapUtils;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantFileMetadata;
import org.opencb.biodata.models.variant.avro.*;
import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata;
import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader;
import org.opencb.cellbase.core.models.DataReleaseSource;
import org.opencb.cellbase.lib.EtlCommons;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

import static org.opencb.cellbase.lib.EtlCommons.HGMD_VERSION_FILENAME;

/**
* Created by jtarraga on 23/02/22.
*/
public class HGMDIndexer extends ClinicalIndexer {
private final Path hgmdFile;
private final String assembly;

private String date;
private String version;

public HGMDIndexer(Path hgmdFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb)
throws IOException {
super(genomeSequenceFilePath);
Expand All @@ -51,6 +60,18 @@ public void index() throws RocksDBException, IOException {
logger.info("Parsing HGMD file ...");

try {

Path hgmdVersionPath = hgmdFile.getParent().resolve(HGMD_VERSION_FILENAME);
if (!Files.exists(hgmdVersionPath)) {
throw new IOException("HGMD version file " + hgmdVersionPath + " does not exist");
}
ObjectMapper jsonObjectMapper = new ObjectMapper();
ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataReleaseSource.class);
DataReleaseSource dataReleaseSource = jsonObjectReader.readValue(hgmdVersionPath.toFile());

this.date = dataReleaseSource.getDate();
this.version = dataReleaseSource.getVersion();

VariantStudyMetadata metadata = new VariantFileMetadata(null, hgmdFile.toString()).toVariantStudyMetadata("study");
VariantVcfHtsjdkReader reader = new VariantVcfHtsjdkReader(hgmdFile.toAbsolutePath(), metadata);
for (Variant variant : reader) {
Expand All @@ -74,7 +95,6 @@ public void index() throws RocksDBException, IOException {
throw e;
} finally {
logger.info("Done");

// this.printSummary();
}
}
Expand All @@ -93,7 +113,7 @@ private void parseHgmdInfo(Variant variant) {
}

// Source
entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, "2020.3", "2020"));
entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, version, date));

// Assembly
entry.setAssembly(assembly);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
import org.opencb.cellbase.core.config.DownloadProperties;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.commons.utils.FileUtils;

import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.client.WebTarget;
import java.io.*;
import java.io.BufferedWriter;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
Expand All @@ -34,6 +34,8 @@
import java.util.List;
import java.util.Map;

import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION_FILENAME;

public class ClinicalDownloadManager extends AbstractDownloadManager {

private static final String CLINVAR_NAME = "ClinVar";
Expand Down Expand Up @@ -82,7 +84,7 @@ public List<DownloadFile> downloadClinical() throws IOException, InterruptedExce
downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString()));
clinvarUrls.add(url);
saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, configuration.getDownload().getClinvar()
.getVersion(), getTimeStamp(), clinvarUrls, clinicalFolder.resolve("clinvarVersion.json"));
.getVersion(), getTimeStamp(), clinvarUrls, clinicalFolder.resolve(CLINVAR_VERSION_FILENAME));

logger.info("\t\tDone");

Expand Down Expand Up @@ -137,58 +139,11 @@ public List<DownloadFile> downloadClinical() throws IOException, InterruptedExce
// Collections.singletonList(url), clinicalFolder.resolve("iarctp53Version.json"));
// }

if (Files.notExists(clinicalFolder.resolve("clinvar_chunks"))) {
Files.createDirectories(clinicalFolder.resolve("clinvar_chunks"));
splitClinvar(clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE), clinicalFolder.resolve("clinvar_chunks"));
}

return downloadFiles;
}
return null;
}

/**
 * Splits a ClinVar XML release file into smaller chunk files holding at most 10000 ClinVarSet entries each.
 * <p>
 * Every line read before the first line starting with {@code <ClinVarSet } is treated as the document header
 * and is copied (trimmed) to the top of each chunk; each chunk is terminated with {@code </ReleaseSet>}.
 * Chunks are written as {@code chunk_<n>.xml} inside the output folder, numbered from 0. If the input holds
 * no ClinVarSet entry, no chunk file is written.
 *
 * @param clinvarXmlFilePath ClinVar XML file to split
 * @param splitOutdirPath    folder where the chunk files are created; it is not created by this method
 * @throws IOException if the input cannot be read, or a chunk file cannot be created or fully written
 */
private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException {
    final int entriesPerChunk = 10000;

    StringBuilder header = new StringBuilder();
    boolean beforeEntry = true;
    boolean inEntry = false;
    int count = 0;
    int chunk = 0;

    // The current chunk writer is rotated inside the loop, so it cannot be a try-with-resources variable;
    // the finally clause guarantees it is released if anything fails half-way through a chunk.
    PrintWriter pw = null;
    Path chunkFile = null;
    try (BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath)) {
        String line;
        while ((line = br.readLine()) != null) {
            String trimmedLine = line.trim();

            if (trimmedLine.startsWith("<ClinVarSet ")) {
                inEntry = true;
                beforeEntry = false;
                if (count % entriesPerChunk == 0) {
                    // First entry of a new chunk: open the file and replicate the XML header
                    chunkFile = splitOutdirPath.resolve("chunk_" + chunk + ".xml");
                    pw = new PrintWriter(new FileOutputStream(chunkFile.toFile()));
                    pw.println(header.toString().trim());
                }
                count++;
            }

            if (beforeEntry) {
                header.append(line).append("\n");
            }

            if (inEntry) {
                pw.println(line);
            }

            if (trimmedLine.startsWith("</ClinVarSet>")) {
                inEntry = false;
                // The null check protects against a closing tag seen while no chunk is open
                if (pw != null && count % entriesPerChunk == 0) {
                    finishClinvarChunk(pw, chunkFile);
                    pw = null;
                    chunk++;
                }
            }
        }

        // Terminate the last, partially filled chunk; nothing to do if the last chunk was exactly full
        // or the input had no entries at all
        if (pw != null) {
            finishClinvarChunk(pw, chunkFile);
            pw = null;
        }
    } finally {
        if (pw != null) {
            pw.close();
        }
    }
}

/**
 * Writes the closing {@code </ReleaseSet>} tag, closes the chunk writer and reports any write error that
 * PrintWriter recorded silently.
 *
 * @param chunkWriter writer of the chunk being finished
 * @param chunkFile   path of the chunk file, used only for the error message
 * @throws IOException if any write to the chunk file failed
 */
private void finishClinvarChunk(PrintWriter chunkWriter, Path chunkFile) throws IOException {
    chunkWriter.print("</ReleaseSet>");
    chunkWriter.close();
    if (chunkWriter.checkError()) {
        throw new IOException("Error writing ClinVar chunk file " + chunkFile);
    }
}

/**
 * Obtains the DoCM version by delegating to {@code getVersionFromVersionLine} with the marker that
 * identifies the version {@code <select>} element line.
 *
 * @param docmIndexHtml path passed through to {@code getVersionFromVersionLine}
 * @return whatever {@code getVersionFromVersionLine} returns for the given file and marker
 */
private String getDocmVersion(Path docmIndexHtml) {
    final String versionLineMarker = "<select name=\"version\" id=\"version\"";
    return getVersionFromVersionLine(docmIndexHtml, versionLineMarker);
}
Expand Down
Loading
Loading