2024
Hämmerle, Michelle; Guellil, Meriam; Cheronet, Olivia; Sawyer, Susanna; Ruiz-Gartzia, Irune; Lizano, Esther; Rymbekova, Aigerim; Gelabert, Pere; Bernardi, Paolo; Han, Sojung; Trgovec-Greif, Lovro; Rattei, Thomas; Schuenemann, Verena J.; Marques-Bonet, Tomas; Guschanski, Katerina; Calvignac-Spencer, Sebastien; Pinhasi, Ron; Kuhlwilm, Martin
Screening great ape museum specimens for DNA viruses Journal Article
In: bioRxiv, 2024.
Abstract | Links | BibTeX | Tags: Project 08
% bioRxiv preprint. The doi field holds the bare, version-independent DOI;
% the version-specific landing page (v3) is kept in the url field instead.
@article{nokey,
  author     = {Hämmerle, Michelle and Guellil, Meriam and Cheronet, Olivia and Sawyer, Susanna and Ruiz-Gartzia, Irune and Lizano, Esther and Rymbekova, Aigerim and Gelabert, Pere and Bernardi, Paolo and Han, Sojung and Trgovec-Greif, Lovro and Rattei, Thomas and Schuenemann, Verena J. and Marques-Bonet, Tomas and Guschanski, Katerina and Calvignac-Spencer, Sebastien and Pinhasi, Ron and Kuhlwilm, Martin},
  title      = {Screening great ape museum specimens for {DNA} viruses},
  journal    = {bioRxiv},
  year       = {2024},
  date       = {2024-11-01},
  doi        = {10.1101/2024.04.25.591107},
  url        = {https://www.biorxiv.org/content/10.1101/2024.04.25.591107v3},
  abstract   = {Natural history museum collections harbour a record of wild species from the past centuries, providing a unique opportunity to study animals as well as their infectious agents. Thousands of great ape specimens are kept in these collections, and could become an important resource for studying the evolution of DNA viruses. Their genetic material is likely to be preserved in dry museum specimens, as reported previously for monkeypox virus genomes from historical orangutan specimens. Here, we screened 209 great ape museum specimens for 99 different DNA viruses, using hybridization capture coupled with short-read high-throughput sequencing. We determined the presence of multiple viruses within this dataset from historical specimens and obtained several near-complete viral genomes. In particular, we report high-coverage (>18-fold) hepatitis B virus genomes from one gorilla and two chimpanzee individuals, which are phylogenetically placed within clades infecting the respective host species.},
  keywords   = {Project 08},
  pubstate   = {published},
  tppubtype  = {article},
}
van der Toorn, Wiep; Bohn, Patrick; Liu-Wei, Wang; Olguin-Nava, Marco; Smyth, Redmond P.; von Kleist, Max
Demultiplexing and barcode-specific adaptive sampling for nanopore direct RNA sequencing Journal Article
In: bioRxiv, 2024.
Abstract | Links | BibTeX | Tags: Project 08
% bioRxiv preprint. The doi field holds the bare, version-independent DOI;
% the version-specific landing page (v2) is kept in the url field instead.
@article{vanderToorn2024,
  author     = {van der Toorn, Wiep and Bohn, Patrick and Liu-Wei, Wang and Olguin-Nava, Marco and Smyth, Redmond P. and von Kleist, Max},
  title      = {Demultiplexing and barcode-specific adaptive sampling for nanopore direct {RNA} sequencing},
  journal    = {bioRxiv},
  year       = {2024},
  date       = {2024-10-23},
  doi        = {10.1101/2024.07.22.604276},
  url        = {https://www.biorxiv.org/content/10.1101/2024.07.22.604276v2},
  abstract   = {Nanopore direct RNA sequencing (dRNA-seq) enables unique insights into (epi-)transcriptomics. However, applications are currently limited by the lack of accurate and cost-effective sample multiplexing. We introduce WarpDemuX, an ultra-fast and highly accurate adapter-barcoding and demultiplexing approach. WarpDemuX enhances speed and accuracy by fast processing of the raw nanopore signal, use of a light-weight machine-learning algorithm and design of optimized barcode sets. We demonstrate its utility by performing a rapid phenotypic profiling of different SARS-CoV-2 viruses, crucial for pandemic prevention and response, through multiplexed sequencing of longitudinal samples on a single flowcell. This identifies systematic differences in transcript abundance and poly(A) tail lengths during infection. Additionally, integrating WarpDemuX into sequencing control software enables real-time enrichment of target molecules through barcode-specific adaptive sampling, which we demonstrate by enriching low abundance viral RNA. In summary, WarpDemuX is a broadly applicable, high-performance, and economical multiplexing solution for nanopore dRNA-seq, facilitating advanced (epi-)transcriptomic research.},
  keywords   = {Project 08},
  pubstate   = {published},
  tppubtype  = {article},
}
Liu-Wei, Wang; van der Toorn, Wiep; Bohn, Patrick; Hölzer, Martin; Smyth, Redmond P.; von Kleist, Max
Sequencing accuracy and systematic errors of nanopore direct RNA sequencing Journal Article
In: BMC Genomics, vol. 25, pp. 528, 2024.
Abstract | Links | BibTeX | Tags: Project 08
% Journal article; pages holds the article number (BMC Genomics has no page ranges).
% Percent signs in the abstract are escaped so the field can be typeset by LaTeX.
@article{Liu-Wei2024,
  author     = {Liu-Wei, Wang and van der Toorn, Wiep and Bohn, Patrick and Hölzer, Martin and Smyth, Redmond P. and von Kleist, Max},
  title      = {Sequencing accuracy and systematic errors of nanopore direct {RNA} sequencing},
  journal    = {BMC Genomics},
  volume     = {25},
  pages      = {528},
  year       = {2024},
  date       = {2024-05-28},
  doi        = {10.1186/s12864-024-10440-w},
  abstract   = {Background
Direct RNA sequencing (dRNA-seq) on the Oxford Nanopore Technologies (ONT) platforms can produce reads covering up to full-length gene transcripts, while containing decipherable information about RNA base modifications and poly-A tail lengths. Although many published studies have been expanding the potential of dRNA-seq, its sequencing accuracy and error patterns remain understudied.
Results
We present the first comprehensive evaluation of sequencing accuracy and characterisation of systematic errors in dRNA-seq data from diverse organisms and synthetic in vitro transcribed RNAs. We found that for sequencing kits SQK-RNA001 and SQK-RNA002, the median read accuracy ranged from 87\% to 92\% across species, and deletions significantly outnumbered mismatches and insertions. Due to their high abundance in the transcriptome, heteropolymers and short homopolymers were the major contributors to the overall sequencing errors. We also observed systematic biases across all species at the levels of single nucleotides and motifs. In general, cytosine/uracil-rich regions were more likely to be erroneous than guanines and adenines. By examining raw signal data, we identified the underlying signal-level features potentially associated with the error patterns and their dependency on sequence contexts. While read quality scores can be used to approximate error rates at base and read levels, failure to detect DNA adapters may be a source of errors and data loss. By comparing distinct basecallers, we reason that some sequencing errors are attributable to signal insufficiency rather than algorithmic (basecalling) artefacts. Lastly, we generated dRNA-seq data using the latest SQK-RNA004 sequencing kit released at the end of 2023 and found that although the overall read accuracy increased, the systematic errors remain largely identical compared to the previous kits.
Conclusions
As the first systematic investigation of dRNA-seq errors, this study offers a comprehensive overview of reproducible error patterns across diverse datasets, identifies potential signal-level insufficiency, and lays the foundation for error correction methods.},
  keywords   = {Project 08},
  pubstate   = {published},
  tppubtype  = {article},
}
Direct RNA sequencing (dRNA-seq) on the Oxford Nanopore Technologies (ONT) platforms can produce reads covering up to full-length gene transcripts, while containing decipherable information about RNA base modifications and poly-A tail lengths. Although many published studies have been expanding the potential of dRNA-seq, its sequencing accuracy and error patterns remain understudied.
Results
We present the first comprehensive evaluation of sequencing accuracy and characterisation of systematic errors in dRNA-seq data from diverse organisms and synthetic in vitro transcribed RNAs. We found that for sequencing kits SQK-RNA001 and SQK-RNA002, the median read accuracy ranged from 87% to 92% across species, and deletions significantly outnumbered mismatches and insertions. Due to their high abundance in the transcriptome, heteropolymers and short homopolymers were the major contributors to the overall sequencing errors. We also observed systematic biases across all species at the levels of single nucleotides and motifs. In general, cytosine/uracil-rich regions were more likely to be erroneous than guanines and adenines. By examining raw signal data, we identified the underlying signal-level features potentially associated with the error patterns and their dependency on sequence contexts. While read quality scores can be used to approximate error rates at base and read levels, failure to detect DNA adapters may be a source of errors and data loss. By comparing distinct basecallers, we reason that some sequencing errors are attributable to signal insufficiency rather than algorithmic (basecalling) artefacts. Lastly, we generated dRNA-seq data using the latest SQK-RNA004 sequencing kit released at the end of 2023 and found that although the overall read accuracy increased, the systematic errors remain largely identical compared to the previous kits.
Conclusions
As the first systematic investigation of dRNA-seq errors, this study offers a comprehensive overview of reproducible error patterns across diverse datasets, identifies potential signal-level insufficiency, and lays the foundation for error correction methods.
2022
Liu-Wei, Wang; Aubrey, Wayne; Clare, Amanda; Hoehndorf, Robert; Creevey, Christopher J.; Dimonaco, Nicholas J.
FrameRate: learning the coding potential of unassembled metagenomic reads Journal Article
In: bioRxiv, 2022.
Abstract | Links | BibTeX | Tags: Project 08, WP 2.1 Microevolution: Virus quasispecies
% bioRxiv preprint. The tool name in the title is brace-protected so that
% sentence-casing bibliography styles keep its camelCase spelling.
@article{Liu-Wei2022c,
  author     = {Liu-Wei, Wang and Aubrey, Wayne and Clare, Amanda and Hoehndorf, Robert and Creevey, Christopher J. and Dimonaco, Nicholas J.},
  title      = {{FrameRate}: learning the coding potential of unassembled metagenomic reads},
  journal    = {bioRxiv},
  year       = {2022},
  date       = {2022-09-19},
  doi        = {10.1101/2022.09.16.508314},
  abstract   = {Motivation Metagenomic assembly is a slow and computationally intensive process and despite needing iterative rounds for improvement and completeness the resulting assembly often fails to incorporate many of the input sequencing reads. This is further complicated when there is reduced read-depth and/or artefacts which result in chimeric assemblies both of which are especially prominent in the assembly of metagenomic datasets. Many of these limitations could potentially be overcome by exploiting the information content stored in the reads directly and thus eliminating the need for assembly in a number of situations.
Results We explored the prediction of coding potential of DNA reads by training a machine learning model on existing protein sequences. Named ‘FrameRate’, this model can predict the coding frame(s) from unassembled DNA sequencing reads directly, thus greatly reducing the computational resources required for genome assembly and similarity-based inference to pre-computed databases. Using the eggNOG-mapper function annotation tool, the predicted coding frames from FrameRate were functionally verified by comparing to the results from full-length protein sequences reconstructed with an established metagenome assembly and gene prediction pipeline from the same metagenomic sample. FrameRate captured equivalent functional profiles from the coding frames while reducing the required storage and time resources significantly. FrameRate was also able to annotate reads that were not represented in the assembly, capturing this ‘missing’ information. As an ultra-fast read-level assembly-free coding profiler, FrameRate enables rapid characterisation of almost every sequencing read directly, whether it can be assembled or not, and thus circumvent many of the problems caused by contemporary assembly workflows.},
  keywords   = {Project 08, WP 2.1 Microevolution: Virus quasispecies},
  pubstate   = {published},
  tppubtype  = {article},
}
Results We explored the prediction of coding potential of DNA reads by training a machine learning model on existing protein sequences. Named ‘FrameRate’, this model can predict the coding frame(s) from unassembled DNA sequencing reads directly, thus greatly reducing the computational resources required for genome assembly and similarity-based inference to pre-computed databases. Using the eggNOG-mapper function annotation tool, the predicted coding frames from FrameRate were functionally verified by comparing to the results from full-length protein sequences reconstructed with an established metagenome assembly and gene prediction pipeline from the same metagenomic sample. FrameRate captured equivalent functional profiles from the coding frames while reducing the required storage and time resources significantly. FrameRate was also able to annotate reads that were not represented in the assembly, capturing this ‘missing’ information. As an ultra-fast read-level assembly-free coding profiler, FrameRate enables rapid characterisation of almost every sequencing read directly, whether it can be assembled or not, and thus circumvent many of the problems caused by contemporary assembly workflows.
2021
Goettsch, Winfried; Beerenwinkel, Niko; Deng, Li; Dölken, Lars; Dutilh, Bas E.; Erhard, Florian; Kaderali, Lars; von Kleist, Max; Marquet, Roland; Matthijnssens, Jelle; McCallin, Shawna; McMahon, Dino; Rattei, Thomas; van Rij, Ronald P.; Robertson, David L.; Schwemmle, Martin; Stern-Ginossar, Noam; Marz, Manja
ITN -- VIROINF: Understanding (Harmful) Virus-Host Interactions by Linking Virology and Bioinformatics Journal Article
In: Viruses, vol. 13, no. 5, pp. 766, 2021.
Abstract | Links | BibTeX | Tags: Project 01, Project 02, Project 03, Project 04, Project 05, Project 06, Project 07, Project 08, Project 09, Project 10, Project 11, Project 12, Project 13, Project 14, Project 15, WP 1.1 Virus identification, WP 1.2 Host prediction, WP 1.3 Virus-host interactions, WP 1.4 Virus regulation, WP 1.5 Virus products, WP 2.1 Microevolution: Virus quasispecies, WP 2.2 Macroevolution: Natural selection of viruses
% Key changed from the placeholder "nokey": an earlier entry in this file already
% uses that key, and BibTeX/Biber reject a repeated key and drop the later entry.
% {van Rij} is kept braced (whole surname, no von-part) exactly as originally entered.
@article{Goettsch2021,
  author     = {Goettsch, Winfried and Beerenwinkel, Niko and Deng, Li and Dölken, Lars and Dutilh, Bas E. and Erhard, Florian and Kaderali, Lars and von Kleist, Max and Marquet, Roland and Matthijnssens, Jelle and McCallin, Shawna and McMahon, Dino and Rattei, Thomas and {van Rij}, Ronald P. and Robertson, David L. and Schwemmle, Martin and Stern-Ginossar, Noam and Marz, Manja},
  title      = {{ITN} -- {VIROINF}: Understanding (Harmful) Virus-Host Interactions by Linking Virology and Bioinformatics},
  journal    = {Viruses},
  volume     = {13},
  number     = {5},
  pages      = {766},
  year       = {2021},
  date       = {2021-04-27},
  urldate    = {2021-04-27},
  doi        = {10.3390/v13050766},
  abstract   = {Many recent studies highlight the fundamental importance of viruses. Besides their important role as human and animal pathogens, their beneficial, commensal or harmful functions are poorly understood. By developing and applying tailored bioinformatical tools in important virological models, the Marie Skłodowska-Curie Initiative International Training Network VIROINF will provide a better understanding of viruses and the interaction with their hosts. This will open the door to validate methods of improving viral growth, morphogenesis and development, as well as to control strategies against unwanted microorganisms. The key feature of VIROINF is its interdisciplinary nature, which brings together virologists and bioinformaticians to achieve common goals.},
  keywords   = {Project 01, Project 02, Project 03, Project 04, Project 05, Project 06, Project 07, Project 08, Project 09, Project 10, Project 11, Project 12, Project 13, Project 14, Project 15, WP 1.1 Virus identification, WP 1.2 Host prediction, WP 1.3 Virus-host interactions, WP 1.4 Virus regulation, WP 1.5 Virus products, WP 2.1 Microevolution: Virus quasispecies, WP 2.2 Macroevolution: Natural selection of viruses},
  pubstate   = {published},
  tppubtype  = {article},
}