2024
Fuhrmann, Lara; Jablonski, Kim Philipp; Topolsky, Ivan; Batavia, Aashil A; Borgsmüller, Nico; Baykal, Pelin Icer; Carrara, Matteo; Chen, Chaoran; Dondi, Arthur; Dragan, Monica; Dreifuss, David; John, Anika; Langer, Benjamin; Okoniewski, Michal; du Plessis, Louis; Schmitt, Uwe; Singer, Franziska; Stadler, Tanja; Beerenwinkel, Niko
V-pipe 3.0: a sustainable pipeline for within-sample viral genetic diversity estimation Journal Article
In: GigaScience, vol. 13, pp. giae065, 2024.
Abstract | Links | BibTeX | Tags: Project 03, WP 1.3 Virus-host interactions, WP 2.1 Microevolution: Virus quasispecies
@comment{Fuhrmann et al., V-pipe 3.0, GigaScience 13 (2024), article number giae065. Author names use the unambiguous Last, First form.}
@article{Fuhrmann2024b,
  author     = {Fuhrmann, Lara and Jablonski, Kim Philipp and Topolsky, Ivan and Batavia, Aashil A and Borgsmüller, Nico and Baykal, Pelin Icer and Carrara, Matteo and Chen, Chaoran and Dondi, Arthur and Dragan, Monica and Dreifuss, David and John, Anika and Langer, Benjamin and Okoniewski, Michal and du Plessis, Louis and Schmitt, Uwe and Singer, Franziska and Stadler, Tanja and Beerenwinkel, Niko},
  title      = {V-pipe 3.0: a sustainable pipeline for within-sample viral genetic diversity estimation},
  journal    = {GigaScience},
  volume     = {13},
  pages      = {giae065},
  year       = {2024},
  date       = {2024-09-30},
  urldate    = {2024-09-30},
  doi        = {10.1093/gigascience/giae065},
  abstract   = {The large amount and diversity of viral genomic datasets generated by next-generation sequencing technologies poses a set of challenges for computational data analysis workflows, including rigorous quality control, scaling to large sample sizes, and tailored steps for specific applications. Here, we present V-pipe 3.0, a computational pipeline designed for analyzing next-generation sequencing data of short viral genomes. It is developed to enable reproducible, scalable, adaptable, and transparent inference of genetic diversity of viral samples. By presenting 2 large-scale data analysis projects, we demonstrate the effectiveness of V-pipe 3.0 in supporting sustainable viral genomic data science.},
  keywords   = {Project 03, WP 1.3 Virus-host interactions, WP 2.1 Microevolution: Virus quasispecies},
  pubstate   = {published},
  tppubtype  = {article}
}
Fuhrmann, Lara; Langer, Benjamin; Topolsky, Ivan; Beerenwinkel, Niko
VILOCA: Sequencing quality-aware haplotype reconstruction and mutation calling for short- and long-read data Journal Article
In: bioRxiv, 2024.
Abstract | Links | BibTeX | Tags: Project 03, WP 1.3 Virus-host interactions, WP 2.1 Microevolution: Virus quasispecies
@comment{Fuhrmann et al., VILOCA preprint, bioRxiv (2024). The acronym in the title is brace-protected so sentence-casing styles do not downcase it, and percent signs in the abstract are backslash-escaped so they cannot start a TeX comment when the field is typeset.}
@article{Fuhrmann2024,
  author     = {Fuhrmann, Lara and Langer, Benjamin and Topolsky, Ivan and Beerenwinkel, Niko},
  title      = {{VILOCA}: Sequencing quality-aware haplotype reconstruction and mutation calling for short- and long-read data},
  journal    = {bioRxiv},
  year       = {2024},
  date       = {2024-06-09},
  doi        = {10.1101/2024.06.06.597712},
  abstract   = {RNA viruses exist in large heterogeneous populations within their host. The structure and diversity of virus populations affects disease progression and treatment outcomes. Next-generation sequencing allows detailed viral population analysis, but inferring diversity from error-prone reads is challenging. Here, we present VILOCA, a method for mutation calling and reconstruction of local haplotypes from short- and long-read viral sequencing data. Local haplotypes refer to genomic regions that have approximately the length of the input reads. VILOCA recovers local haplotypes by using a Dirichlet process mixture model to cluster reads around their unobserved haplotypes and leveraging quality scores of the sequencing reads. We assessed the performance of VILOCA in terms of mutation calling and haplotype reconstruction accuracy on simulated and experimental Illumina, PacBio, and Oxford Nanopore data. On simulated and experimental Illumina data, VILOCA performed better or similar to existing methods. On the simulated long-read data, VILOCA is able to recover on average 82\% of the ground truth mutations with perfect precision compared to only 64\% recall and 90\% precision of the second-best method. In summary, VILOCA provides significantly improved accuracy in mutation and haplotype calling, especially for long-read sequencing data, and therefore facilitates the comprehensive characterization of heterogeneous within-host viral populations.},
  keywords   = {Project 03, WP 1.3 Virus-host interactions, WP 2.1 Microevolution: Virus quasispecies},
  pubstate   = {published},
  tppubtype  = {article}
}
2022
Liu-Wei, Wang; Aubrey, Wayne; Clare, Amanda; Hoehndorf, Robert; Creevey, Christopher J.; Dimonaco, Nicholas J.
FrameRate: learning the coding potential of unassembled metagenomic reads Journal Article
In: bioRxiv, 2022.
Abstract | Links | BibTeX | Tags: Project 08, WP 2.1 Microevolution: Virus quasispecies
@comment{Liu-Wei et al., FrameRate preprint, bioRxiv (2022). The camelCase tool name in the title is brace-protected so sentence-casing styles keep its capitalisation. The abstract text is kept verbatim.}
@article{Liu-Wei2022c,
  author     = {Liu-Wei, Wang and Aubrey, Wayne and Clare, Amanda and Hoehndorf, Robert and Creevey, Christopher J. and Dimonaco, Nicholas J.},
  title      = {{FrameRate}: learning the coding potential of unassembled metagenomic reads},
  journal    = {bioRxiv},
  year       = {2022},
  date       = {2022-09-19},
  doi        = {10.1101/2022.09.16.508314},
  abstract   = {Motivation Metagenomic assembly is a slow and computationally intensive process and despite needing iterative rounds for improvement and completeness the resulting assembly often fails to incorporate many of the input sequencing reads. This is further complicated when there is reduced read-depth and/or artefacts which result in chimeric assemblies both of which are especially prominent in the assembly of metagenomic datasets. Many of these limitations could potentially be overcome by exploiting the information content stored in the reads directly and thus eliminating the need for assembly in a number of situations.
Results We explored the prediction of coding potential of DNA reads by training a machine learning model on existing protein sequences. Named ‘FrameRate’, this model can predict the coding frame(s) from unassembled DNA sequencing reads directly, thus greatly reducing the computational resources required for genome assembly and similarity-based inference to pre-computed databases. Using the eggNOG-mapper function annotation tool, the predicted coding frames from FrameRate were functionally verified by comparing to the results from full-length protein sequences reconstructed with an established metagenome assembly and gene prediction pipeline from the same metagenomic sample. FrameRate captured equivalent functional profiles from the coding frames while reducing the required storage and time resources significantly. FrameRate was also able to annotate reads that were not represented in the assembly, capturing this ‘missing’ information. As an ultra-fast read-level assembly-free coding profiler, FrameRate enables rapid characterisation of almost every sequencing read directly, whether it can be assembled or not, and thus circumvent many of the problems caused by contemporary assembly workflows.},
  keywords   = {Project 08, WP 2.1 Microevolution: Virus quasispecies},
  pubstate   = {published},
  tppubtype  = {article}
}
Results We explored the prediction of coding potential of DNA reads by training a machine learning model on existing protein sequences. Named ‘FrameRate’, this model can predict the coding frame(s) from unassembled DNA sequencing reads directly, thus greatly reducing the computational resources required for genome assembly and similarity-based inference to pre-computed databases. Using the eggNOG-mapper function annotation tool, the predicted coding frames from FrameRate were functionally verified by comparing to the results from full-length protein sequences reconstructed with an established metagenome assembly and gene prediction pipeline from the same metagenomic sample. FrameRate captured equivalent functional profiles from the coding frames while reducing the required storage and time resources significantly. FrameRate was also able to annotate reads that were not represented in the assembly, capturing this ‘missing’ information. As an ultra-fast read-level assembly-free coding profiler, FrameRate enables rapid characterisation of almost every sequencing read directly, whether it can be assembled or not, and thus circumvent many of the problems caused by contemporary assembly workflows.
2021
Goettsch, Winfried; Beerenwinkel, Niko; Deng, Li; Dölken, Lars; Dutilh, Bas E.; Erhard, Florian; Kaderali, Lars; von Kleist, Max; Marquet, Roland; Matthijnssens, Jelle; McCallin, Shawna; McMahon, Dino; Rattei, Thomas; van Rij, Ronald P.; Robertson, David L.; Schwemmle, Martin; Stern-Ginossar, Noam; Marz, Manja
ITN -- VIROINF: Understanding (Harmful) Virus-Host Interactions by Linking Virology and Bioinformatics Journal Article
In: Viruses, vol. 13, no. 5, pp. 766, 2021.
Abstract | Links | BibTeX | Tags: Project 01, Project 02, Project 03, Project 04, Project 05, Project 06, Project 07, Project 08, Project 09, Project 10, Project 11, Project 12, Project 13, Project 14, Project 15, WP 1.1 Virus identification, WP 1.2 Host prediction, WP 1.3 Virus-host interactions, WP 1.4 Virus regulation, WP 1.5 Virus products, WP 2.1 Microevolution: Virus quasispecies, WP 2.2 Macroevolution: Natural selection of viruses
@comment{Goettsch et al., ITN VIROINF overview, Viruses 13(5):766 (2021). The two acronyms in the title are brace-protected so sentence-casing styles do not downcase them. NOTE(review): the citation key looks like an export placeholder; it is left unchanged because documents may already cite it, so rename it only together with every citing document.}
@article{nokey,
  author     = {Goettsch, Winfried and Beerenwinkel, Niko and Deng, Li and Dölken, Lars and Dutilh, Bas E. and Erhard, Florian and Kaderali, Lars and von Kleist, Max and Marquet, Roland and Matthijnssens, Jelle and McCallin, Shawna and McMahon, Dino and Rattei, Thomas and {van Rij}, Ronald P. and Robertson, David L. and Schwemmle, Martin and Stern-Ginossar, Noam and Marz, Manja},
  title      = {{ITN} -- {VIROINF}: Understanding (Harmful) Virus-Host Interactions by Linking Virology and Bioinformatics},
  journal    = {Viruses},
  volume     = {13},
  number     = {5},
  pages      = {766},
  year       = {2021},
  date       = {2021-04-27},
  urldate    = {2021-04-27},
  doi        = {10.3390/v13050766},
  abstract   = {Many recent studies highlight the fundamental importance of viruses. Besides their important role as human and animal pathogens, their beneficial, commensal or harmful functions are poorly understood. By developing and applying tailored bioinformatical tools in important virological models, the Marie Skłodowska-Curie Initiative International Training Network VIROINF will provide a better understanding of viruses and the interaction with their hosts. This will open the door to validate methods of improving viral growth, morphogenesis and development, as well as to control strategies against unwanted microorganisms. The key feature of VIROINF is its interdisciplinary nature, which brings together virologists and bioinformaticians to achieve common goals.},
  keywords   = {Project 01, Project 02, Project 03, Project 04, Project 05, Project 06, Project 07, Project 08, Project 09, Project 10, Project 11, Project 12, Project 13, Project 14, Project 15, WP 1.1 Virus identification, WP 1.2 Host prediction, WP 1.3 Virus-host interactions, WP 1.4 Virus regulation, WP 1.5 Virus products, WP 2.1 Microevolution: Virus quasispecies, WP 2.2 Macroevolution: Natural selection of viruses},
  pubstate   = {published},
  tppubtype  = {article}
}