diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 16db1f82bc..70e12bc1d9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -367,7 +367,7 @@ private CellBaseBuilder buildConservation() { private CellBaseBuilder buildClinicalVariants() { Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER); - copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("clinvarVersion.json"))); + copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve(CLINVAR_VERSION_FILENAME))); copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("gwasVersion.json"))); CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 97460d5a71..7a7bdab168 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -484,7 +484,7 @@ private void loadClinical() throws FileNotFoundException { // Update release (collection and sources) List sources = new ArrayList<>(Arrays.asList( - input.resolve("clinvarVersion.json"), + input.resolve(CLINVAR_VERSION_FILENAME), input.resolve("cosmicVersion.json"), input.resolve("gwasVersion.json") )); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index d09291bc3e..630393dd2b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -56,15 +56,18 @@ public class EtlCommons { public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json"; public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant"; - public static final String CLINVAR_VERSION = "2024-05"; - public static final String CLINVAR_DATE = "2024-05"; - public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz"; + public static final String CLINVAR_XML_FILE = "ClinVarFullRelease.xml.gz"; public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv"; public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz"; public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz"; + public static final String CLINVAR_VERSION_FILENAME = "clinvarVersion.json"; + public static final String IARCTP53_FILE = "IARC-TP53.zip"; public static final String GWAS_FILE = "gwas_catalog.tsv"; public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz"; + public static final String COSMIC_VERSION_FILENAME = "cosmicVersion.json"; + public static final String HGMD_VERSION_FILENAME = "hgmdVersion.json"; + @Deprecated public static final String DBSNP_FILE = "GCF_000001405.40.gz"; public static final String DBSNP_NAME = "dbSNP"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java index 39cddb4fcf..5cedc5cc3c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java @@ -17,11 +17,14 @@ package org.opencb.cellbase.lib.builders.clinical.variant; import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.variant.clinvar.rcv.ClinvarParser; import org.opencb.biodata.formats.variant.clinvar.rcv.v64jaxb.*; import org.opencb.biodata.models.sequence.SequenceLocation; import org.opencb.biodata.models.variant.avro.*; +import org.opencb.cellbase.core.models.DataReleaseSource; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; import org.opencb.commons.ProgressLogger; @@ -42,8 +45,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_DATE; -import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION; +import static org.opencb.cellbase.lib.EtlCommons.*; //import org.opencb.biodata.formats.variant.clinvar.v24jaxb.*; @@ -84,6 +86,10 @@ public class ClinVarIndexer extends ClinicalIndexer { private final Path clinvarVariationAlleleFile; private final Path clinvarEFOFile; private final String assembly; + + private String version; + private String date; + private int numberSomaticRecords = 0; private int numberGermlineRecords = 0; private int numberNoDiseaseTrait = 0; @@ -99,7 +105,7 @@ public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinva Path clinvarEFOFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); - this.rdb = rdb; + this.clinvarXMLFiles = clinvarXMLFiles; this.clinvarSummaryFile = clinvarSummaryFile; this.clinvarVariationAlleleFile = clinvarVariationAlleleFile; @@ -107,10 +113,24 @@ public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinva this.normalize = normalize; this.genomeSequenceFilePath = genomeSequenceFilePath; this.assembly = assembly; + + this.rdb = rdb; } public void index() throws RocksDBException { try { + Path clinvarVersionPath = clinvarSummaryFile.getParent().resolve(CLINVAR_VERSION_FILENAME); + if (!Files.exists(clinvarVersionPath)) { + throw new IOException("ClinVar version file " + clinvarVersionPath + " does not exist"); + } + ObjectMapper jsonObjectMapper = new ObjectMapper(); + ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataReleaseSource.class); + DataReleaseSource dataReleaseSource = jsonObjectReader.readValue(clinvarVersionPath.toFile()); + + this.date = dataReleaseSource.getDate(); + this.version = dataReleaseSource.getVersion(); + + Map traitsToEfoTermsMap = loadEFOTerms(); Map> rcvToAlleleLocationData = parseVariantSummary(traitsToEfoTermsMap); @@ -157,15 +177,9 @@ public boolean accept(File dir, String name) { } logger.info("Done"); printSummary(); - } catch (RocksDBException e) { - logger.error("Error reading/writing from/to the RocksDB index while indexing ClinVar"); - throw e; - } catch (JAXBException e) { - logger.error("Error unmarshalling clinvar Xml file: " + e.getMessage()); - e.printStackTrace(); - } catch (IOException e) { - logger.error("Error indexing clinvar Xml file: " + e.getMessage()); - e.printStackTrace(); + } catch (RocksDBException | JAXBException | IOException e) { + logger.error("Error indexing ClinVar", e); + throw new RocksDBException(e.getMessage()); } } @@ -332,7 +346,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, String variation String mateVariantString, String clinicalHaplotypeString, Map traitsToEfoTermsMap) { - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, date); // Create a set to avoid situations like germline;germline;germline List alleleOrigin = null; if (!EtlCommons.isMissing(lineFields[VARIANT_SUMMARY_ORIGIN_COLUMN])) { @@ -413,7 +427,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu throws JsonProcessingException { List additionalProperties = new ArrayList<>(3); - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, date); // String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); VariantClassification variantClassification = getVariantClassification( diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java index b1e758ab79..870fe2659a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java @@ -23,13 +23,13 @@ import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; -import java.io.File; -import java.io.IOException; +import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -125,8 +125,17 @@ public void parse() throws IOException, RocksDBException, CellBaseException { if (this.clinvarXMLFile != null && this.clinvarSummaryFile != null && this.clinvarVariationAlleleFile != null && Files.exists(clinvarXMLFile) && Files.exists(clinvarSummaryFile) && Files.exists(clinvarVariationAlleleFile)) { - ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile, - clinvarVariationAlleleFile, clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb); + + Path chunksPaths = clinvarXMLFile.getParent().resolve("clinvar_chunks"); + if (Files.notExists(chunksPaths)) { + logger.info("Splitting ClinVar XML file in multiple ClinVar chunk files at {} ...", chunksPaths); + Files.createDirectories(chunksPaths); + splitClinvar(this.clinvarXMLFile, chunksPaths); + logger.info("Done"); + } + + ClinVarIndexer clinvarIndexer = new ClinVarIndexer(chunksPaths, clinvarSummaryFile, clinvarVariationAlleleFile, + clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb); clinvarIndexer.index(); } else { logger.warn("One or more of required ClinVar files are missing. Skipping ClinVar data.\n" @@ -190,6 +199,48 @@ public void parse() throws IOException, RocksDBException, CellBaseException { } + private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { + BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath); + PrintWriter pw = null; + StringBuilder header = new StringBuilder(); + boolean beforeEntry = true; + boolean inEntry = false; + int count = 0; + int chunk = 0; + String line; + while ((line = br.readLine()) != null) { + if (line.trim().startsWith("")) { + inEntry = false; + if (count % 10000 == 0) { + pw.print(""); + pw.close(); + chunk++; + } + } + } + pw.print(""); + pw.close(); + br.close(); + } + private void serializeRDB(RocksDB rdb) throws IOException { // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's // named "iterator" diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java index d2ce12dee8..066b471ddc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java @@ -16,6 +16,8 @@ package org.opencb.cellbase.lib.builders.clinical.variant; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.collections4.MapUtils; import org.opencb.biodata.models.variant.Variant; @@ -23,14 +25,18 @@ import org.opencb.biodata.models.variant.avro.*; import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata; import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader; +import org.opencb.cellbase.core.models.DataReleaseSource; import org.opencb.cellbase.lib.EtlCommons; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import static org.opencb.cellbase.lib.EtlCommons.HGMD_VERSION_FILENAME; + /** * Created by jtarraga on 23/02/22. */ @@ -38,6 +44,9 @@ public class HGMDIndexer extends ClinicalIndexer { private final Path hgmdFile; private final String assembly; + private String date; + private String version; + public HGMDIndexer(Path hgmdFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); @@ -51,6 +60,18 @@ public void index() throws RocksDBException, IOException { logger.info("Parsing HGMD file ..."); try { + + Path hgmdVersionPath = hgmdFile.getParent().resolve(HGMD_VERSION_FILENAME); + if (!Files.exists(hgmdVersionPath)) { + throw new IOException("HGMD version file " + hgmdVersionPath + " does not exist"); + } + ObjectMapper jsonObjectMapper = new ObjectMapper(); + ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataReleaseSource.class); + DataReleaseSource dataReleaseSource = jsonObjectReader.readValue(hgmdVersionPath.toFile()); + + this.date = dataReleaseSource.getDate(); + this.version = dataReleaseSource.getVersion(); + VariantStudyMetadata metadata = new VariantFileMetadata(null, hgmdFile.toString()).toVariantStudyMetadata("study"); VariantVcfHtsjdkReader reader = new VariantVcfHtsjdkReader(hgmdFile.toAbsolutePath(), metadata); for (Variant variant : reader) { @@ -74,7 +95,6 @@ public void index() throws RocksDBException, IOException { throw e; } finally { logger.info("Done"); - // this.printSummary(); } } @@ -93,7 +113,7 @@ private void parseHgmdInfo(Variant variant) { } // Source - entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, "2020.3", "2020")); + entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, version, date)); // Assembly entry.setAssembly(assembly); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index bb9e0c36e4..6dd8050662 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -20,12 +20,12 @@ import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; import javax.ws.rs.client.Client; import javax.ws.rs.client.ClientBuilder; import javax.ws.rs.client.WebTarget; -import java.io.*; +import java.io.BufferedWriter; +import java.io.IOException; import java.net.URI; import java.nio.file.Files; import java.nio.file.Path; @@ -34,6 +34,8 @@ import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION_FILENAME; + public class ClinicalDownloadManager extends AbstractDownloadManager { private static final String CLINVAR_NAME = "ClinVar"; @@ -82,7 +84,7 @@ public List downloadClinical() throws IOException, InterruptedExce downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString())); clinvarUrls.add(url); saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, configuration.getDownload().getClinvar() - .getVersion(), getTimeStamp(), clinvarUrls, clinicalFolder.resolve("clinvarVersion.json")); + .getVersion(), getTimeStamp(), clinvarUrls, clinicalFolder.resolve(CLINVAR_VERSION_FILENAME)); logger.info("\t\tDone"); @@ -137,58 +139,11 @@ public List downloadClinical() throws IOException, InterruptedExce // Collections.singletonList(url), clinicalFolder.resolve("iarctp53Version.json")); // } - if (Files.notExists(clinicalFolder.resolve("clinvar_chunks"))) { - Files.createDirectories(clinicalFolder.resolve("clinvar_chunks")); - splitClinvar(clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE), clinicalFolder.resolve("clinvar_chunks")); - } - return downloadFiles; } return null; } - private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { - BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath); - PrintWriter pw = null; - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - pw.print(""); - pw.close(); - chunk++; - } - } - } - pw.print(""); - pw.close(); - br.close(); - } - private String getDocmVersion(Path docmIndexHtml) { return getVersionFromVersionLine(docmIndexHtml, "