diff --git a/GRANTS.yaml b/GRANTS.yaml
index f732e3f68f6a51..f12061b5eb8be0 100644
--- a/GRANTS.yaml
+++ b/GRANTS.yaml
@@ -192,6 +192,13 @@ h2020-defend:
This work has received funding from the DEFEND project (www.defend2020.eu) with funding from the European Union's Horizon 2020 research and innovation programme under grant agreement No 773701.
url: https://www.defend2020.eu
+metabohub:
+ name: MetaboHUB
+ github: false
+ joined: 2025-07
+ url: https://www.metabohub.fr/
+ avatar: "/training-material/shared/images/metabohub.png"
+
nfdi4bioimage:
name: NFDI4Bioimage
short_name: "NFDI4Bioimage"
@@ -254,7 +261,15 @@ oscars:
url: https://oscars-project.eu/
github: false
avatar: "/training-material/shared/images/logos/OSCARS-logo-EUflag.png"
-
+
+rfmf:
+ name: Réseau Francophone de Métabolomique et Fluxomique
+ short_name: RFMF
+ github: false
+ joined: 2025-07
+ url: https://www.rfmf.fr/
+ avatar: "/training-material/shared/images/rfmf.png"
+
skills4eosc:
name: Skills for European Open Science Cloud
short_name: "Skills4SEOSC"
diff --git a/shared/images/metabohub.png b/shared/images/metabohub.png
new file mode 100644
index 00000000000000..646d23f9345dc4
Binary files /dev/null and b/shared/images/metabohub.png differ
diff --git a/shared/images/rfmf.png b/shared/images/rfmf.png
new file mode 100644
index 00000000000000..dc63311fa7a1d2
Binary files /dev/null and b/shared/images/rfmf.png differ
diff --git a/topics/metabolomics/tutorials/gcms/tutorial.bib b/topics/metabolomics/tutorials/gcms/tutorial.bib
index 250d7376458b42..c706b881fd1a1b 100644
--- a/topics/metabolomics/tutorials/gcms/tutorial.bib
+++ b/topics/metabolomics/tutorials/gcms/tutorial.bib
@@ -24,48 +24,39 @@ @article{Wehrens2014
journal = {Journal of Chromatography B}
}
-@article{Stein1994,
- doi = {10.1016/1044-0305(94)87009-8},
- url = {https://doi.org/10.1016/1044-0305(94)87009-8},
- year = {1994},
- month = aug,
- publisher = {Journal of the American Society for Mass Spectrometry},
- volume = {5},
- number = {9},
- pages = {859-866},
- author = {Stephen E. Stein and Donald R. Scott},
- title = {Optimization and testing of mass spectral library search algorithms for compound identification},
- journal = {Journal of the American Society for Mass Spectrometry}
-}
-
-@article{Stein1999,
- doi = {10.1016/S1044-0305(99)00047-1},
- url = {https://doi.org/10.1016/S1044-0305(99)00047-1},
- year = {1999},
- month = aug,
- publisher = {Journal of the American Society for Mass Spectrometry},
- volume = {10},
- pages = {770-781},
- author = {Stephen E. Stein},
- title = {An integrated method for spectrum extraction and compound identification from gas chromatography/mass spectrometry data},
- journal = {Journal of the American Society for Mass Spectrometry}
+@article{Stanstrup2019,
+author = {Stanstrup, Jan and Broeckling, Corey and Helmus, Rick and Hoffmann, Nils and Math{\'{e}}, Ewy and Naake, Thomas and Nicolotti, Luca and Peters, Kristian and Rainer, Johannes and Salek, Reza and Schulze, Tobias and Schymanski, Emma and Stravs, Michael and Th{\'{e}}venot, Etienne and Treutler, Hendrik and Weber, Ralf and Willighagen, Egon and Witting, Michael and Neumann, Steffen},
+doi = {10.3390/metabo9100200},
+issn = {2218-1989},
+journal = {Metabolites},
+keywords = {Bioconductor,CRAN,Compound identification,Data integration,Feature selection,Lipidomics,Mass spectrometry,Metabolite networks,Metabolomics,NMR spectroscopy,R,Signal processing,Statistical data analysis},
+month = {sep},
+number = {10},
+pages = {200},
+title = {{The metaRbolomics Toolbox in Bioconductor and beyond}},
+url = {https://www.mdpi.com/2218-1989/9/10/200},
+volume = {9},
+year = {2019}
}
-@article{Thvenot2015,
- doi = {10.1021/acs.jproteome.5b00354},
- url = {https://doi.org/10.1021/acs.jproteome.5b00354},
- year = {2015},
- month = jul,
- publisher = {American Chemical Society ({ACS})},
- volume = {14},
- number = {8},
- pages = {3322--3335},
- author = {Etienne A. Th{\'{e}}venot and Aur{\'{e}}lie Roux and Ying Xu and Eric Ezan and Christophe Junot},
- title = {Analysis of the Human Adult Urinary Metabolome Variations with Age, Body Mass Index, and Gender by Implementing a Comprehensive Workflow for Univariate and {OPLS} Statistical Analyses},
- journal = {Journal of Proteome Research}
+@article{Misra2021,
+author = {Misra, Biswapriya B.},
+doi = {10.1007/s11306-021-01796-1},
+isbn = {0123456789},
+issn = {1573-3882},
+journal = {Metabolomics},
+keywords = {Annotation,Database,In silico,Metabolite,Metabolomics,Program,Recourse,Software,Tool},
+month = {may},
+number = {5},
+pages = {49},
+pmid = {33977389},
+publisher = {Springer US},
+title = {{New software tools, databases, and resources in metabolomics: updates from 2020}},
+url = {https://doi.org/10.1007/s11306-021-01796-1},
+volume = {17},
+year = {2021}
}
-
@article{Giacomoni2014,
doi = {10.1093/bioinformatics/btu813},
url = {https://doi.org/10.1093/bioinformatics/btu813},
@@ -80,7 +71,6 @@ @article{Giacomoni2014
journal = {Bioinformatics}
}
-
@article{Guitton2017,
doi = {10.1016/j.biocel.2017.07.002},
url = {https://doi.org/10.1016/j.biocel.2017.07.002},
@@ -117,21 +107,6 @@ @misc{CAMERA
year = {2017}
}
-@article{Tautenhahn2008,
- author="Tautenhahn, Ralf and B{\"o}ttcher, Christoph and Neumann, Steffen",
- title="Highly sensitive feature detection for high resolution LC/MS",
- journal="BMC Bioinformatics",
- year="2008",
- month="Nov",
- day="28",
- volume="9",
- number="1",
- pages="504",
- abstract="Liquid chromatography coupled to mass spectrometry (LC/MS) is an important analytical technology for e.g. metabolomics experiments. Determining the boundaries, centres and intensities of the two-dimensional signals in the LC/MS raw data is called feature detection. For the subsequent analysis of complex samples such as plant extracts, which may contain hundreds of compounds, corresponding to thousands of features -- a reliable feature detection is mandatory.",issn="1471-2105",
- doi="10.1186/1471-2105-9-504",
- url="https://doi.org/10.1186/1471-2105-9-504"
-}
-
@article{Kopka2005,
doi = {10.1093/bioinformatics/bti236},
url = {http://gmd.mpimp-golm.mpg.de/},
@@ -144,14 +119,56 @@ @CSB.DB:
journal = {Bioinformatics}
}
+@article{gatto2020msnbase,
+ title={MSnbase, efficient and elegant R-based processing and visualization of raw mass spectrometry data},
+ author={Gatto, Laurent and Gibb, Sebastian and Rainer, Johannes},
+ journal={Journal of Proteome Research},
+ volume={20},
+ number={1},
+ pages={1063--1069},
+ year={2020},
+ publisher={ACS Publications},
+ doi={10.1021/acs.jproteome.0c00313}
+}
+
+@article{Stein1994,
+ doi = {10.1016/1044-0305(94)87009-8},
+ url = {https://doi.org/10.1016/1044-0305(94)87009-8},
+ year = {1994},
+ month = aug,
+ publisher = {Journal of the American Society for Mass Spectrometry},
+ volume = {5},
+ number = {9},
+ pages = {859-866},
+ author = {Stephen E. Stein and Donald R. Scott},
+ title = {Optimization and testing of mass spectral library search algorithms for compound identification},
+ journal = {Journal of the American Society for Mass Spectrometry}
+}
+
+@article{Stein1999,
+ doi = {10.1016/S1044-0305(99)00047-1},
+ url = {https://doi.org/10.1016/S1044-0305(99)00047-1},
+ year = {1999},
+ month = aug,
+ publisher = {Journal of the American Society for Mass Spectrometry},
+ volume = {10},
+ pages = {770-781},
+ author = {Stephen E. Stein},
+ title = {An integrated method for spectrum extraction and compound identification from gas chromatography/mass spectrometry data},
+ journal = {Journal of the American Society for Mass Spectrometry}
+}
+
@article{Horai2010,
- author = {Horai Hisayuki and Arita Masanori and Kanaya Shigehiko and Nihei Yoshito and Ikeda Tasuku and Suwa Kazuhiro and Ojima Yuya and Tanaka Kenichi and Tanaka Satoshi and Aoshima Ken and Oda Yoshiya and Kakazu Yuji and Kusano Miyako and Tohge Takayuki and Matsuda Fumio and Sawada Yuji and Hirai Masami Yokota and Nakanishi Hiroki and Ikeda Kazutaka and Akimoto Naoshige and Maoka Takashi and Takahashi Hiroki and Ara Takeshi and Sakurai Nozomu and Suzuki Hideyuki and Shibata Daisuke and Neumann Steffen and Iida Takashi and Tanaka Ken and Funatsu Kimito and Matsuura Fumito and Soga Tomoyoshi and Taguchi Ryo and Saito Kazuki and Nishioka Takaaki},
+ author = {Horai, Hisayuki and Arita, Masanori and Kanaya, Shigehiko and Nihei, Yoshito and Ikeda, Tasuku and Suwa, Kazuhiro and Ojima, Yuya and Tanaka, Kenichi and Tanaka, Satoshi and Aoshima, Ken and Oda, Yoshiya and Kakazu, Yuji and Kusano, Miyako and Tohge, Takayuki and Matsuda, Fumio and Sawada, Yuji and Hirai, Masami Yokota and Nakanishi, Hiroki and Ikeda, Kazutaka and Akimoto, Naoshige and Maoka, Takashi and Takahashi, Hiroki and Ara, Takeshi and Sakurai, Nozomu and Suzuki, Hideyuki and Shibata, Daisuke and Neumann, Steffen and Iida, Takashi and Tanaka, Ken and Funatsu, Kimito and Matsuura, Fumito and Soga, Tomoyoshi and Taguchi, Ryo and Saito, Kazuki and Nishioka, Takaaki},
title = {MassBank: a public repository for sharing mass spectral data for life sciences},
journal = {Journal of Mass Spectrometry},
volume = {45},
+ number = {7},
pages = {703-714},
+ keywords = {MassBank, public database, distributed database, metabolite, spectral similarity},
doi = {10.1002/jms.1777},
- url = {https://massbank.eu/MassBank/Search},
+ url = {https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/abs/10.1002/jms.1777},
+ eprint = {https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/pdf/10.1002/jms.1777},
+ abstract = {Abstract MassBank is the first public repository of mass spectra of small chemical compounds for life sciences (<3000 Da). The database contains 605 electron-ionization mass spectrometry(EI-MS), 137 fast atom bombardment MS and 9276 electrospray ionization (ESI)-MSn data of 2337 authentic compounds of metabolites, 11 545 EI-MS and 834 other-MS data of 10 286 volatile natural and synthetic compounds, and 3045 ESI-MS2 data of 679 synthetic drugs contributed by 16 research groups (January 2010). ESI-MS2 data were analyzed under nonstandardized, independent experimental conditions. MassBank is a distributed database. Each research group provides data from its own MassBank data servers distributed on the Internet. MassBank users can access either all of the MassBank data or a subset of the data by specifying one or more experimental conditions. In a spectral search to retrieve mass spectra similar to a query mass spectrum, the similarity score is calculated by a weighted cosine correlation in which weighting exponents on peak intensity and the mass-to-charge ratio are optimized to the ESI-MS2 data. MassBank also provides a merged spectrum for each compound prepared by merging the analyzed ESI-MS2 data on an identical compound under different collision-induced dissociation conditions. Data merging has significantly improved the precision of the identification of a chemical compound by 21–23\% at a similarity score of 0.6. Thus, MassBank is useful for the identification of chemical compounds and the publication of experimental data. Copyright © 2010 John Wiley \& Sons, Ltd.},
year = {2010}
}
-
diff --git a/topics/metabolomics/tutorials/gcms/tutorial.md b/topics/metabolomics/tutorials/gcms/tutorial.md
index a3c92ff4ef8acb..b91fea4d33c5e0 100644
--- a/topics/metabolomics/tutorials/gcms/tutorial.md
+++ b/topics/metabolomics/tutorials/gcms/tutorial.md
@@ -1,38 +1,72 @@
---
layout: tutorial_hands_on
-draft: true
+draft: false
title: 'Mass spectrometry : GC-MS analysis with metaMS package'
level: Introductory
-zenodo_link : 'https://zenodo.org/record/3244991'
-questions :
- - What are the main steps of GC-MS datas processing for metabolomic analysis ?
- - How te be able to annotate the maximum of unknowns using Galaxy ?
-objectives :
- - To be sure you have already comprehend the diversity of MS pre-processing analysis.
- - To learn the principal functions of metaMS package through Galaxy.
- - To evaluate the potential of this new GC-MS workflow for GC-MS metabolomic analysis.
-time_estimation : 2H
-key_points :
- - Have a good file containing all your peaks during the first stopover
- - Find all your unknowns in your datas
- - Find your stanards if you have some
-requirements :
- - type: "internal"
- topic_name: metabolomics
- tutorials:
- - lcms
-contributors :
+zenodo_link: 'https://zenodo.org/records/16538501'
+questions:
+- What are the main steps for gas chromatography-mass spectrometry (GC-MS) data processing for untargeted metabolomic analysis?
+- How to conduct metabolomic GC-MS data analysis from preprocessing to annotation using Galaxy?
+objectives:
+- To be sure you have already comprehended the diversity of MS pre-processing analysis.
+- To learn the principal functions of metaMS package for GC-MS data processing through Galaxy.
+- To evaluate the potential of two workflow approaches and available Galaxy tools when dealing with GC-MS metabolomic analysis.
+time_estimation: 2H
+key_points:
+- To preprocess untargeted GC-MS metabolomic data, you need a large variety of steps and tools.
+- Although main steps are standard, various ways to combine and to set parameters for tools exist, depending on your data.
+- Resources are available in Galaxy, but do not forget that you need appropriate knowledge to perform a relevant analysis.
+requirements:
+- type: "internal"
+ topic_name: metabolomics
+ tutorials:
+ - lcms-preprocessing
+contributions:
+ authorship:
- jsaintvanne
-
+ - yguitton
+ editing:
+ - yguitton
+ - melpetera
+ - jsaintvanne
+ testing:
+ - workflow4metabolomics
+funding:
+ - metabohub
+ - rfmf
+
---
+You may already know that there are different types of *-omic* sciences; out of these, metabolomics is most closely related to phenotypes. Metabolomics involves the study of different types of matrices, such as blood, urine, tissues, in various organisms including plants. It focuses on studying the very small molecules which are called *metabolites*, to better understand matters linked to the metabolism. However, studying metabolites is not a piece of cake since it requires several critical steps which still have some major bottlenecks. Metabolomics is still quite a young science, and has many kinds of specific challenges.
+
+
+One of the three main technologies used to perform metabolomic analysis is **Gas-Chromatography Mass Spectrometry** (GC-MS). Data analysis for this technology requires a large variety of steps, ranging from extracting information from the raw data, to statistical analysis and annotation. Many packages in R/Python are available for the analysis of GC-MS or LC-MS data - for more details see the reviews by {% cite Stanstrup2019 %} and {% cite Misra2021 %}.
+
+This tutorial explains the main steps involved in untargeted **GC-MS** data processing. To do so, we focus on some open-source solutions integrated within the Galaxy framework, namely **XCMS** and **metaMS**. The selected tools and functionalities only cover a small portion of the available tools but allow you to **perform a complete GC-MS analysis** in a single environment.
+In this tutorial, we will learn how to (1) extract features from the raw data using **XCMS** ({% cite Smith2006 %}), (2) deconvolute the detected features into spectra with **metaMS** ({% cite Wehrens2014 %}) and (3) annotate unknown spectra using spectral database comparison tools.
+
+To illustrate this approach, we will use data from {% cite Dittami2012 %}. Due to time constraints in processing the original dataset, a limited subset of samples was utilized to illustrate the workflow. This subset (see details below) demonstrates the key steps of metabolomics analysis, from pre-processing to annotation. Although the results derived from this reduced sample size may not be scientifically robust, they provide insight into essential methodological foundations of GC-MS data-processing workflow.
+
-A lot of packages are available for the analysis of GC-MS or LC-MS data. Typically, hardware vendors provide software that is optimized for the instrument and allow a direct interaction of the lab scientist with the data. Some other open-source alternatives such as **XCMS** are also able to be integrated easily in web interfaces, allowing large numbers of files to be processed simultaneously. Because of the generality of packages like **XCMS**, several other packages have been developed to use the functionality of **XCMS** for optimal performance in a particular context. Package **metaMS** does so for the field of untargeted metabolomics, focuses on the GC-MS analysis during this tutorial. One of the goals **metaMS** was to set up a simple system with few user-settable parameters, capable of handling the vast majority of untargeted metabolomics experiments.
+> Algae samples
+>
+> The objective of this study was to investigate the adaptation mechanisms of the brown algae *Ectocarpus* to low-salinity environments. The research focused on examining physiological tolerance and metabolic changes in freshwater and marine strains of *Ectocarpus*. Using transcriptomic (gene expression profiling) and metabolic analyses, the authors identified significant, reversible changes occurring in the freshwater strain when exposed to seawater. Both strains exhibited similarities in gene expression under identical conditions; however, substantial differences were observed in metabolite profiles.
+>
+> The study utilized a **freshwater strain** of Ectocarpus and a **marine strain** for comparative analysis. The algae were cultured in media with varying salinities, prepared by diluting natural seawater or adding NaCl. The algae were acclimated to these conditions before extraction.
+>
+> The six samples used in this training were analyzed by GC-MS (low resolution instrument). A marine strain raised in sea water media (2 replicates) and freshwater strains raised in either 5% or 100% sea water media (2 replicates each). [Dataset available on Zenodo](https://zenodo.org/records/16538501)
+>
+{: .details}
-During this tutorial, we will learn how to process easily a test dataset from raw files to the annotation using W4M Galaxy. Datas are from {% cite Dittami2012 %} and have been used as test dataset for the development of the Galaxy wrappers.
+To process the GC-MS data, we can use several tools. One of these is **XCMS** ({% cite Smith2006 %}), a general R package for untargeted metabolomics profiling. It can be used for any type of mass spectrometry acquisition (centroid and profile) or resolution (from low to high resolution), including FT-MS data coupled with different kinds of chromatography (liquid or gas). Because of the generality of packages like **XCMS**, several other packages have been developed to use the functionality of **XCMS** for optimal performance in a particular context. The R package **metaMS** ({% cite Wehrens2014 %}) does so for the field of GC-MS untargeted metabolomics. One of the goals of **metaMS** was to set up a simple system with few user-settable parameters, capable of handling untargeted metabolomics experiments.
+In this tutorial we use **XCMS** to detect chromatographic peaks within our samples. Once we have detected them, they need to be deconvoluted into mass spectra representing chemical compounds. For that, we use **metaMS** functions. To normalize the retention time of deconvoluted spectra in our sample, we compute the retention index using Alkane references and a dedicated function of **metaMS**. Finally, we identify detected spectra by aligning them with a database of known compounds. This can be achieved using an in-house built database in the common MSP format (`.msp`) (used in the NIST MS search program for example), resulting in a table of annotated compounds.
+>
+> In Galaxy, other GC-MS data processing workflows are available and may be of interest for more advanced Galaxy users: [Link to Metabolomics GTN]({% link topics/metabolomics/index.md %})
+{: .comment}
+
>
>
> In this tutorial, we will cover:
@@ -43,87 +77,133 @@ During this tutorial, we will learn how to process easily a test dataset from ra
{: .agenda}
-# First steps of pre-processing using a standard XCMS workflow (mandatory)
+# Data preparation and preprocessing
-The first step of the workflow is the pre-processing of the raw data with **XCMS** ({% cite Smith2006 %}).
-{: .text-justify}
+Before we can start with the actual analysis pipeline, we first need to download and prepare our dataset. Many of the preprocessing steps can be run in parallel on individual samples. Therefore, we recommend using the Dataset collections in Galaxy. This can be achieved by using the dataset collection option from the beginning of your analysis when uploading your data into Galaxy.
-**XCMS** {% icon tool %} is a free and open-source software dedicated to pre-processing any type of mass spectrometry acquisition files from low to high resolution, including FT-MS data coupled with different kind of chromatography (liquid or gas). This software is used worldwide by a huge community of specialists in metabolomics.
-{: .text-justify}
-This software is based on different algorithms that have been published, and is provided and maintained using R software.
-{: .text-justify}
+## Import the data into Galaxy
-**MSnbase readMSData** {% icon tool %} function, prior to **XCMS**, is able to read files with open format as `mzXML`, `mzML`, `mzData` and `netCDF`, which are independent of the constructors' formats. The **XCMS** package itself is composed of R functions able to extract, filter, align and fill gap, with the possibility to annotate isotopes, adducts and fragments (using the R package CAMERA, {% cite CAMERA %}). This set of functions gives modularity, and thus is particularly well adapted to define workflows, one of the key points of Galaxy.
-{: .text-justify}
-
-First step of this tutorial is to download the data test. As describe in the introduction, we will use datas from {% cite Dittami2012 %}. We will only process on a subset of their data.
-So, you can **import your files directly in Galaxy from Zenodo (see hands-on below)** or download files into your computer using the following link then upload them on Galaxy:
-{: .text-justify}
-
-[](https://doi.org/10.5281/zenodo.3631074)
-
-
-Then, to be able to pre-process our GC-MS data, we need to **start with the peakpicking of MS data**.
-One Galaxy Training material already explains how to act with MS data. We encourage you to **follow this link and complete the corresponding tutorial**: [Mass spectrometry: LC-MS preprocessing with XCMS]({% link topics/metabolomics/tutorials/lcms-preprocessing/tutorial.md %}).
-For GC-MS analysis you **don't really need to follow all of this previous tutorial** but for a better understanding of your data, it is recommanded to try it with their test dataset.
-Concerning the current GC-MS tutorial, you **just have to compute the following steps and specific parameters** described in the hands-on part below (please follow parameters values to have the same results during the training).
-{: .text-justify}
-
-## 1 - Import the data into Galaxy
-
-> Data upload
+> Upload data
>
> 1. Create a new history for this tutorial
>
> {% snippet faqs/galaxy/histories_create_new.md %}
>
-> 2. Import the following 6 `mzData` files into a collection named `mzData`
-> - Option 1: from a shared data library (ask your instructor)
-> - Option 2: from Zenodo using the URLs given below
+> 2. Import the files from [Zenodo]({{ page.zenodo_link }}) into a collection:
>
> ```
-> https://zenodo.org/record/3631074/files/alg11.mzData
-> https://zenodo.org/record/3631074/files/alg2.mzData
-> https://zenodo.org/record/3631074/files/alg3.mzData
-> https://zenodo.org/record/3631074/files/alg7.mzData
-> https://zenodo.org/record/3631074/files/alg8.mzData
-> https://zenodo.org/record/3631074/files/alg9.mzData
+> https://zenodo.org/records/16538501/files/alg11.mzML
+> https://zenodo.org/records/16538501/files/alg2.mzML
+> https://zenodo.org/records/16538501/files/alg3.mzML
+> https://zenodo.org/records/16538501/files/alg7.mzML
+> https://zenodo.org/records/16538501/files/alg8.mzML
+> https://zenodo.org/records/16538501/files/alg9.mzML
> ```
>
-> {% snippet faqs/galaxy/datasets_import_via_link.md collection=true format="mzml" collection_name="mzData" renaming=false %}
->
-> {% snippet faqs/galaxy/datasets_import_from_data_library.md astype="as a Collection" %}
+> {% snippet faqs/galaxy/datasets_import_via_link.md collection=true format="mzml" collection_name="input" renaming=false %}
>
-> 3. Make sure your data is in a **collection**. Make sure it is named `mzData`
-> - If you forgot to select the collection option during import, you can create the collection now:
+> 3. Make sure your data is in a **collection**. You can always manually create the collection from separate files:
>
> {% snippet faqs/galaxy/collections_build_list.md %}
>
-> 4. Import the following 2 files from Zenodo or from a shared data library (ask your instructor).
-> Beware: these files must not be in a collection.
+> In the further steps, this dataset collection will be referred to as `input` (and we recommend naming this collection like that to avoid confusion).
+>
+> 4. Import the following extra files from [Zenodo]({{ page.zenodo_link }}):
>
> ```
-> https://zenodo.org/record/3631074/files/sampleMetadata.tsv
-> https://zenodo.org/record/3631074/files/W4M0004_database_small.msp
+> https://zenodo.org/records/16538501/files/reference_alkanes.csv
+> https://zenodo.org/records/16538501/files/W4M0004_database_small.msp
+> https://zenodo.org/records/16538501/files/sampleMetadata.tsv
> ```
>
> {% snippet faqs/galaxy/datasets_import_via_link.md %}
>
-> {% snippet faqs/galaxy/datasets_import_from_data_library.md %}
>
+> > The extra files
+> >
+> > The three additional files contain the **reference_alkanes**, the **W4M0004_database_small**, and the **sampleMetadata**. Those files are auxiliary inputs used in the data processing and contain either extra information about the samples or serve as reference data for indexing and identification.
+> >
+> > The **reference_alkanes** (`.tsv` or `.csv`) with retention times and carbon number or retention index is used to compute the retention index of the deconvoluted peaks. The alkanes should be measured in the same batch as the input sample collection.
+> >
+> > The **W4M0004_database_small** (`.msp`) is a reference database used for the identification of spectra. It contains the recorded and annotated mass spectra of chemical standards, ideally from a similar instrument. The unknown spectra which can be detected in the sample can then be confirmed via comparison with this library. The specific library is an in-house library of metabolite standards extracted with **metaMS**.
+> >
+> > The **sample metadata** (`.csv` or `.tsv`) is a table containing information about our samples. In particular, the tabular file contains for each sample its associated sample name and class (SW, FWS, etc.). It is possible to add more columns to include additional details about the samples (e.g., batch number, injection order).
+> {: .comment}
>
{: .hands_on}
-## 2 - First steps using XCMS
+As a result of this step, you should have in your history a green Dataset collection {% icon param-collection %} with all 6 sample `.mzML` files, as well as three separate files with reference alkanes, reference spectral library, and sample metadata.
+
+## Create the XCMS object
+
+The first part of data processing is using the **XCMS** tool to detect peaks in the MS signal. For that, we first need to take the `.mzML` files and create a format usable by the **XCMS** tool. {% tool [MSnbase readMSData](toolshed.g2.bx.psu.edu/repos/lecorguille/msnbase_readmsdata/msnbase_readmsdata/2.16.1+galaxy0) %} ({% cite gatto2012msnbase %}, {% cite gatto2020msnbase %}) takes as input our files and prepares `RData` files for the first **XCMS** step.
+
-> First steps using a standard XCMS workflow
+>
+> The **MSnbase readMSData** {% icon tool %} function, run prior to **XCMS**, is able to read files in open formats such as `mzXML`, `mzML`, `mzData` and `netCDF`, which are independent of the manufacturers' proprietary formats. Working with open MS data file formats allows users to use tools developed outside of the MS instrument provider. This set of packages/functions gives modularity, and thus is particularly well adapted to define workflows, one of the key points of Galaxy.
+>
+{: .comment}
+
+> Create the XCMS object
>
> 1. {% tool [MSnbase readMSData](toolshed.g2.bx.psu.edu/repos/lecorguille/msnbase_readmsdata/msnbase_readmsdata/2.16.1+galaxy0) %} with the following parameters:
-> - {% icon param-collection %} *"File(s) from your history containing your chromatograms"*: `mzData` (Input dataset collection)
+> - {% icon param-collection %} *"File(s) from your history containing your chromatograms"*: `input`
+>
+> {% snippet faqs/galaxy/tools_select_collection.md %}
+>
+> The output should be `input.raw.RData`
+>
+> This dataset is a collection of `rdata.msnbase.raw` files. That `Rdata` file is necessary in the next step of the workflow. It contains an internal R representation of **XCMS** objects.
+>
+{: .hands_on}
+
+As a result of this step, you should have in your history a new green dataset called `input.raw.RData` that will serve as input for the next processing step: **peak picking**.
+
+# Peak detection using XCMS
+
+The first step in the workflow is to detect the peaks in our data using **XCMS** functions. This part, however, is covered by a [separate tutorial]({{ site.baseurl }}/topics/metabolomics/tutorials/lcms-preprocessing/tutorial.html). Although the tutorial is dedicated to LC-MS data, it can also be followed for our GC-MS data. Therefore, in this section, we do not explain this part of the workflow in detail but rather refer the reader to the dedicated tutorial. Please also pay attention to the parameter values for individual Galaxy tools, as these can differ from the referred tutorial and are adjusted to our GC-MS dataset.
+
+
+> Skip peak detection step
+>
+> Since this step is already covered in a [separate tutorial]({{ site.baseurl }}/topics/metabolomics/tutorials/lcms-preprocessing/tutorial.html), it is possible to skip it. Instead, you can go directly to [Peak deconvolution]({{ site.baseurl }}/topics/metabolomics/tutorials/gcms/tutorial.html#processing-with-metams-option-1) step using a preprocessed **XCMS** object file prepared for you.
>
-> 2. {% tool [xcms findChromPeaks (xcmsSet)](toolshed.g2.bx.psu.edu/repos/lecorguille/xcms_xcmsset/abims_xcms_xcmsSet/3.12.0+galaxy0) %} with the following parameters:
-> - {% icon param-file %} *"RData file"*: `mzData.raw.RData` (output of the **MSnbase readMSData** {% icon tool %} job)
+> > Upload data
+> >
+> > 1. Create a new history for this tutorial
+> >
+> > {% snippet faqs/galaxy/histories_create_new.md %}
+> >
+> > 2. Import the following files from [Zenodo]({{ page.zenodo_link }}):
+> >
+> > ```
+> > https://zenodo.org/records/16538501/files/xset.merged.RData
+> > ```
+> >
+> > {% snippet faqs/galaxy/datasets_import_via_link.md %}
+> >
+> > The format of uploaded file containing **XCMS** object should be `rdata.xcms.fillpeaks` or `rdata`.
+> >
+> > {% snippet faqs/galaxy/datatypes_understanding_datatypes.md %}
+> >
+> {: .hands_on}
+>
+{: .details}
+
+
+The first step (*called peak picking*) is to extract peaks from each of your data files independently. For this purpose, we use the _MatchedFilter_ chromatographic peak detection algorithm implemented in {% tool [xcms findChromPeaks (xcmsSet)](toolshed.g2.bx.psu.edu/repos/lecorguille/xcms_xcmsset/abims_xcms_xcmsSet/3.12.0+galaxy0) %}.
+
+ One Galaxy Training material already explains how to act with MS data. We encourage you to **follow this link and complete the corresponding tutorial**: [Mass spectrometry: LC-MS preprocessing with XCMS]({% link topics/metabolomics/tutorials/lcms-preprocessing/tutorial.md %}).
+For GC-MS analysis you **don't really need to follow all of this previous tutorial** but for a better understanding of your data, it is recommended to try it with their test dataset.
+Concerning the current GC-MS tutorial, you **just have to compute the following steps and specific parameters** described in the hands-on part below (please follow the parameter values below to obtain the same results during the training).
+
+
+
+> Peak picking of GC-MS data with XCMS
+>
+> 1. {% tool [xcms findChromPeaks (xcmsSet)](toolshed.g2.bx.psu.edu/repos/lecorguille/xcms_xcmsset/abims_xcms_xcmsSet/3.12.0+galaxy0) %} with the following parameters:
+> - {% icon param-collection %} *"RData file"*: `input.raw.RData` (the output collection of files from the **MSnbase readMSData** {% icon tool %} step)
> - *"Extraction method for peaks detection"*: `MatchedFilter - peak detection in chromatographic space`
> - *"Full width at half maximum of matched filtration gaussian model peak"*: `5`
> - *"Step size to use for profile generation"*: `0.5`
@@ -135,50 +215,177 @@ Concerning the current GC-MS tutorial, you **just have to compute the following
>
> >
> >
-> > With GC-MS data in profile mode, we need to use the *MatchedFilter* algorithm instead of the *Centwave* one used in the LC-MS tutorial.
+> > For GC-MS data in profile mode, use the *MatchedFilter* algorithm instead of the *CentWave* algorithm used in the LC-MS tutorial.
+> > For low-resolution GC-MS data, a larger m/z difference (0.5 Da) can be used to separate overlapping peaks. If you have high-resolution data (from GC-ToF or GC-Orbitrap), you can use a smaller m/z difference (0.01 Da).
> {: .comment}
>
-> 3. {% tool [xcms findChromPeaks Merger](toolshed.g2.bx.psu.edu/repos/lecorguille/xcms_merge/xcms_merge/3.12.0+galaxy0) %} with the following parameters:
-> - {% icon param-file %} *"RData file"*: `mzData.raw.xset.RData` (output of the **xcms findChromPeaks (xcmsSet)** {% icon tool %} job)
+> 2. {% tool [xcms findChromPeaks Merger](toolshed.g2.bx.psu.edu/repos/lecorguille/xcms_merge/xcms_merge/3.12.0+galaxy0) %} with the following parameters:
+> - {% icon param-collection %} *"RData file"*: `input.raw.xset.RData` (output collection of files from the **xcms findChromPeaks (xcmsSet)** {% icon tool %} job)
> - {% icon param-file %} *"Sample metadata file "*: `sampleMetadata.tsv` (One of the uploaded files from Zenodo)
>
> >
> >
> > To merge your data, you need to **input a sampleMetadata file** containing filenames and their metadata informations like their class for example.
> > If you don't add a sampleMetadata file, the **xcms findChromPeaks Merger** {% icon tool %} tool will **group all your files together**.
-> > You can also **create your sampleMetadata file** with W4M Galaxy tool **xcms get a sampleMetadata file** {% icon tool %} with the following parameters: *"RData file"* outputed from **MSnbase readMSData** {% icon tool %}.
-> > Here is an example of the minimum expectations about a sampleMetadata file (**important**: don't write the format of the file, just their names):
+> > You can also **create your sampleMetadata file** with W4M Galaxy tool {% tool [xcms get a sampleMetadata file](toolshed.g2.bx.psu.edu/repos/lecorguille/xcms_export_samplemetadata/xcms_export_samplemetadata/3.12.0+galaxy3) %} with the following parameters: {% icon param-collection %} *"RData file"*: `input.raw.RData` output from **MSnbase readMSData** {% icon tool %}.
+> > Here is an example of the minimum expectations about a sampleMetadata file (**important**: remove file extension from the sample names, file1.mzML should be file1):
> >
> >
> > | sample_name | class |
> > |:-----------:|:-------:|
-> > | file1 | man |
-> > |-------------+---------|
-> > | file2 | woman |
-> > |-------------+---------|
-> > | file3 | man |
+> > | file1 | marine strain |
+> > | file2 | pool |
+> > | file3 | freshwater strain |
> >
> {: .comment}
>
{: .hands_on}
-The output from **xcms findChromPeaks Merger** {% icon tool %} is a *.RData* file that is mandatory to proceed with the end of the extraction process.
+The output from **xcms findChromPeaks Merger** {% icon tool %} is an *.RData* file required for the next steps of the process.
There are two available options:
-- carrying on using the standard XCMS workflow similarly to the one used in the LC-MS tutorial
-- using the metaMS strategy specifically designed for GC-MS data
-The two options are illustrated in this tutorial.
+- using the metaMS strategy specifically designed for GC-MS data deconvolution and annotation
+- using a full XCMS process for GC-MS data processing
+
+The two options are illustrated in this tutorial.
+
+{% include _includes/cyoa-choices.html option1="Deconvolution and annotation using metaMS" option2="Process GC-MS data with XCMS function" default="Deconvolution and annotation using metaMS" text="Choose below if you just want to follow the pipeline using **metaMS** or **XCMS** for GC-MS deconvolution and annotation" disambiguation="gcms_metams_vs_xcms" %}
+
+
+
+# Processing with metaMS (option 1)
+
+**metaMS** is an R package for MS-based metabolomics data. It was made to ease GC-MS data deconvolution and alignment steps using functions from the **XCMS** and **CAMERA** packages. In its Galaxy implementation, the two main outputs of **metaMS** are: (1) a table of feature intensities in all samples, which can be analyzed with multivariate methods immediately, and (2) an MSP (`.msp`) file containing GC-MS spectra in a common spectral database format.
+The biggest difference between the **XCMS**-only workflow (option 2) and **XCMS + metaMS** GC-MS data processing (option 1) is that rather than a feature-based analysis with individual peaks, as in the option 2 case, **metaMS** performs a pseudospectrum-based analysis and uses it to align compounds between samples. One other advantage is that **metaMS** allows creation of MSP (`.msp`) spectra export files ready for annotation.
+
+>
+> Not all **metaMS** R package functions have been made available in Galaxy.
+> When run in R, the **metaMS** package offers a lot of possibilities. For more information on the full set of metaMS functions, visit the [metaMS Bioconductor page](https://www.bioconductor.org/packages/release/bioc/html/metaMS.html).
+{: .comment}
+
+During this part of the tutorial we are interested in GC-MS analysis with **metaMS**, so we will use the *runGC* function of **metaMS** and describe it in detail to understand all the capabilities of that function.
+The standard workflow of **metaMS** for GC-MS data is the following:
+
+
+
+The *runGC* function is implemented in **metaMS.runGC {% icon tool %}** tool in Galaxy. It takes as inputs an {% icon param-collection %} *.RData* file after **XCMS** peak picking and optionally for annotation purposes an alkane reference file (in `.csv` format) for RI calculation and/or a spectral database in `.msp` format.
+
+
+## Deconvolution and Alignment with metaMS
+
+The peak picking is performed by the usual **XCMS** functions and the output file in `.RData` is used for deconvolution and Alignment steps with _runGC_ function.
+
+> metaMS.runGC
+>
+> 1. {% tool [metaMS.runGC](toolshed.g2.bx.psu.edu/repos/yguitton/metams_rungc/metams_runGC/3.0.0+metaMS1.24.0-galaxy0) %} with the following parameters:
+> - {% icon param-file %} *"Rdata from xcms and merged"*: `xset.merged.RData` (output of the **xcms findChromPeaks Merger** {% icon tool %} step)
+> - *"Settings"* : `user_defined`
+>    - *"RT range option"* : `hide` (If set to *show* you can limit the range of RT processed, for example to remove solvent delays)
+>    - *"RT_Diff"* : `0.05` (Max time deviation in minutes to cluster unknown pseudo-spectra between samples)
+>    - *"Min_Features"* : `5` (Minimal number of features required to have a valid pseudo-spectrum, compounds with fewer ions will be discarded)
+>    - *"similarity_threshold"* : `0.7` (Minimum cosine similarity between pseudo-spectra to be considered as equal)
+>    - *"min.class.fract"* : `0.5` (Minimal fraction of samples in which a pseudo-spectrum should be present to be kept)
+>    - *"min.class.size"* : `2` (Minimum number of samples in which a pseudo-spectrum should be found)
+>    - *"Use Personnal DataBase option"* : `show` (this activates the *"DB file"* selector)
+>        - {% icon param-file %} *"DB file"* : `W4M0004_database_small.msp` (The file downloaded from Zenodo, if not available set the *"Use Personnal DataBase option"* to *hide*)
+>    - *"Use RI option"* : `show` (choose *hide* if you want to skip RI calculation)
+>        - {% icon param-file %} *"RI file"* : `reference_alkanes.csv` (Format should be strictly respected)
+>        - *"Use RI as filter"* : `FALSE` (If set to TRUE only unknown spectra with an RI close to those in the database will be kept)
+> - *"RIshift"* : *"not used"*
+>
+> >
+> >
+>    > For faster processing keep annotation modules *off* by setting *"Use Personnal DataBase option"* : `hide` and *"Use RI option"* : `hide`
+> {: .comment}
+>
+> > Export MSP file to external databases
+> > >You can {% icon dataset-save %} download the MSP file and open it in your favorite spectra processing software or online database for further investigation!
+> {: .tip}
+>
+{: .hands_on}
+
+> Definitions
+> > Pseudo-spectra
+>    > The biggest difference between **XCMS** only or **XCMS + metaMS** GC-MS data processing is that rather than a feature-based analysis with individual peaks, as is the case with **XCMS**, **metaMS** performs a pseudospectrum-based analysis. So, the basic entity is a set of m/z values showing a chromatographic peak at the same retention time. The idea behind that is that Electron Ionization (EI), which is the most widely used ionization mode in GC-MS analysis, generates a lot more ions for the same molecule than Electrospray Ionisation used in LC-MS. The function _runGC_ from **metaMS** is able to group all ions belonging to a molecule into one single cluster that will be used for statistical analysis. For each compound found by **metaMS** a list of grouped m/z and their intensity is exported as a pseudospectrum and this will be used for annotation purposes. For that the MSP file format is used, which is a common format for mass spectra databases. The pseudospectra are created by grouping all m/z values of a chromatographic peak at the same retention time into one single entry, and then exporting this information in the `.msp` format.
+> >
+> >  and the associated pseudospectra (right)")
+> >
+> > This choice is motivated by several considerations. First of all, **in GC the amount of overlap is much less than in LC** : peaks are much narrower. This means that even a one- or two-second difference in retention time can be enough to separate the corresponding mass spectra. Secondly, EI MS spectra for many compounds are **available in extensive libraries like the [NIST library](http://www.nist.gov/srd/nist1a.cfm "NIST library")** or other online ones like [Golm Metabolome library](http://gmd.mpimp-golm.mpg.de/)
+> {: .details}
+>
+> > MSP files
+> > MSP (Mass Spectrum Peak) file is a text file structured according to the NIST MSSearch spectra format. MSP is one of the generally accepted formats for mass spectral libraries (or collections of unidentified spectra, so called spectral archives), and it is compatible with lots of spectra processing programs (MS-DIAL, NIST MS Search, AMDIS, matchms, etc.). It can contain one or more mass spectra, which are split by an empty line. The individual spectra essentially consist of two sections: metadata (such as name, spectrum type, ion mode, retention time, and the number of m/z peaks) and peaks, consisting of m/z and intensity tuples.
+> >
+> > **Example of an MSP file entry:**
+> > ```
+> > Name: Biomarker1
+> > Retention_time: 5.23
+> > Num Peaks: 3
+> > 43 100
+> > 57 80
+> > 71 60
+> > ```
+> >
+> > For more details, you can view an [example MSP file](https://zenodo.org/records/16538501/files/W4M0004_database_small.msp).
+> {: .details}
+{: .details}
+
+## Alignment
+
+Once **metaMS** has created the pseudo-spectra for each unknown compound in each file, we can start the alignment process. This is done by **comparing every pseudospectrum** to all the others in order to group/align similar MS spectra between samples. As a similarity measure, the weighted dot product is used as it is fast, simple, and gives good results ({% cite Stein1994 %}). The first step in the comparison is based on retention, since a comparison of either retention time or retention index is much faster than a spectral comparison. Since the weighted dot product uses scaled mass spectra, the scaling of the database is done once, and then used in all comparisons. If a pseudo-spectrum Y from sample A is similar to pseudo-spectrum X in sample B and they have close retention (time or index), they are considered to be the same compound. This process will create the *dataMatrix* and *variableMetadata* outputs, where aligned pseudo-spectra from different samples will belong to the same line in the final *variableMetadata* and will be considered as Unknown compound X.
+
+
+ and a database entry (blue)")
+
+If an MSP database has been added to the *runGC* function inputs, then the function returns a table where all patterns that have a match with a DB entry are shown with their name; the other pseudo-spectra will be named UnknownX in the first column of the *variableMetadata* and *dataMatrix*.
+
+
+
+## Unknowns research
+
+An important aspect of untargeted metabolomics is the definition of unknowns—features that occur repeatedly in a minimum number or fraction of samples (as defined by the `min.class.fract` and `min.class.size` parameters in the metaMS settings), but for which no annotation has been found. In **metaMS**, these unknown features are found by comparing all patterns (i.e., pseudo-spectra which are groups of features) within a certain retention time (or retention index) difference on their spectral characteristics.
+
+One strength of **metaMS** is its ability to use pseudo-spectra (1) for alignment of unknowns between samples and (2) to compare unknown experimental pseudo-spectra to a previously created in-house spectra database (in MSP format). By doing so, the **metaMS** *runGC* function can serve as an annotation tool. You just have to set *"Use Personnal DataBase option"* : `show` and add your in-house database file as input.
+
+The *runGC* process will always create an MSP file as output (either with only unknown spectra or with a mix of annotated ones and unknowns). That MSP file can be used for database search online (such as Golm ({% cite Kopka2005 %}) and MassBank ({% cite Horai2010 %})) or locally (NIST MSSEARCH); for NIST search, a [PDF tutorial is available](https://workflow4metabolomics.org/sites/default/files/fichiers/documents/w4m_HowToUseNIST_V01.pdf).
+
+For large numbers of samples, this process can take quite some time (it scales quadratically), especially if the allowed difference in retention time is large. The result now is a list of two elements : the first is the annotation table that we also saw after the comparison with the database, and the second is a list of pseudo-spectra corresponding to unknowns.
+
+## Outputs and results
+
+At this stage, all elements are complete : we have the list of pseudo-spectra with an annotation, either as a chemical standard from the database, or an unknown occurring in a sizeable fraction of the injections. The only thing left to do is to calculate relative intensities for the pseudo-spectra, and to put the results in an easy-to-use table. This table consists of two parts. The first part is the information on the “features”, which here are the pseudo-spectra. The second part of the table contains the intensities of these features in the individual injections.
+
+
+
+
+The first five lines are the standards, and the next ones are the unknowns that are identified by the pipeline. In the manual interpretation of this kind of data, the intensities of one or two “highly specific” features are often used to achieve relative quantitation. In an automatic pipeline, this is a risky strategy: not only can the intensity of a peak vary quite dramatically (relative standard deviations of up to 30% are assumed acceptable in GC-MS, e.g. when SPME is applied), but these errors are all the more pronounced in high-intensity peaks (hence the common use of a relative standard deviation).
+
+In addition, one is ignoring the information in the other peaks of the pseudospectrum. In **metaMS**, pseudospectrum intensity is expressed as a multiple of the corresponding reference pattern (either a database pattern or an unknown), where the intensity ratio is determined using robust regression to avoid one deviating feature to influence the results too much ({% cite Wehrens2014 %}). First, we define an object containing all relevant pseudo-spectra, and next the intensities are generated.
+
+In both cases, the result is a list containing a set of patterns corresponding with the compounds that have been found, either annotated or unknown, the relative intensities of these patterns in the individual annotations, and possibly the xcmsSet object for further inspection. In practice, the *runGC* function is all that users need to use.
-# Carrying on using the standard XCMS workflow (option 1)
+# Take a look at your results after metaMS processing
+
+The {% tool [xcms plot chromatogram](toolshed.g2.bx.psu.edu/repos/lecorguille/xcms_plot_chromatogram/xcms_plot_chromatogram/3.12.0+galaxy3) %} tool allows users to see the TIC (Total Ion Chromatogram) and BPC (Base Peak Chromatogram).
+
+If you separated your samples into different classes, this tool can construct TICs and BPCs one class against one class, in a `pdf` file (Figure 5) :
+
+
+
+
-This option follows the standard LC-MS workflow to obtain in the end a dataMatrix file and its corresponding variableMetadata file.
+
+
+# Process GC-MS data with a full XCMS workflow (option 2)
-> Example of end of extraction when using the standard XCMS workflow
+This option follows the standard **XCMS** workflow with GC-MS data at start to obtain in the end a *dataMatrix* file and its corresponding *variableMetadata* file. The main difference with [option 1](#processing-with-metams-option-1) is that the *dataMatrix* file will contain individual peaks rather than pseudo-spectra, and the *variableMetadata* file will contain information about each peak, such as its retention time, m/z, and intensity. **No** `.msp` file will be generated in this case, as the peaks are not grouped into pseudo-spectra, so the annotation process will be different.
+
+> Example untargeted GC-MS data processing with the standard XCMS workflow
>
> 1. {% tool [xcms groupChromPeaks (group)](toolshed.g2.bx.psu.edu/repos/lecorguille/xcms_group/abims_xcms_group/3.12.0+galaxy0) %} with the following parameters:
> - {% icon param-file %} *"RData file"*: `xset.merged.RData` (output of the **xcms findChromPeaks Merger** {% icon tool %} job)
> - *"Method to use for grouping"*: `PeakDensity - peak grouping based on time dimension peak densities`
-> - *"Bandwidth"*: `10.0`
-> - *"Width of overlapping m/z slices"*: `0.05`
+> - *"Bandwidth"*: `5.0`
+> - *"Width of overlapping m/z slices"*: `0.5`
>
> 2. {% tool [xcms fillChromPeaks (fillPeaks)](toolshed.g2.bx.psu.edu/repos/lecorguille/xcms_fillpeaks/abims_xcms_fillPeaks/3.12.0+galaxy0) %} with the following parameters:
> - {% icon param-file %} *"RData file"*: `xset.merged.groupChromPeaks.RData` (output of the **xcms groupChromPeaks (group)** {% icon tool %} job)
@@ -187,53 +394,18 @@ This option follows the standard LC-MS workflow to obtain in the end a dataMatri
> - *"Number of decimal places for retention time values reported in ions' identifiers."*: `2`
> - *"Reported intensity values"*: `maxo`
>
->
+> >
+>    > After **XCMS** processing, extraction of MSP-formatted spectra can be done with the **RAMclustR** {% icon tool %} tool (see this [GTN on GC-MS data processing]({% link topics/metabolomics/tutorials/gc_ms_with_xcms/tutorial.md %})).
+> {: .comment}
{: .hands_on}
-The outputs of this strategy are similar to the ones discribed in the LC-MS tutotial mentioned previously.
-
-> Important : Be careful of the file format
->
-> During each step of pre-processing, your dataset has its format changed and can have also its name changed.
-> To be able to continue to MSMS processing, you need to have a RData object wich is **merged and grouped** (from **xcms findChromPeaks Merger** {% icon tool %} and **xcms groupChromPeaks (group)** {% icon tool %}) at least.
-> It means that you should have a file named `xset.merged.groupChromPeaks.RData` (and maybe with some step more in it).
-{: .comment}
+The outputs of this strategy are similar to the ones described in the LC-MS tutorial mentioned previously.
-# Stopover : Verify your data after the XCMS pre-processing
+Before going to the next step of your GC-MS data processing, here are some questions to be able to verify if your files are ready and if you have the same results as us. Please check these questions :
-When you have processed **all or only needed** steps described before, you can continue with the MS/MS processing part with **msPurity** package.
-Don't forget to always check your files format! For the next step you need to have this file `xset.merged.groupChromPeaks.*.RData` where * is the name of **optionnal** steps you could do during the pre-processing.
-For our example, your file should be named `xset.merged.groupchromPeaks.RData`.
-{: .text-justify}
->
->
-> The pre-processing part of this analysis can be **quite time-consuming**, and already corresponds to quite a few number of steps, depending of your analysis. We highly recommend, at this step of the MS/MS workflow, to split your analysis by beginning a new Galaxy history with **only the files you need** (final xset Rdata file and your data collection of mzML). This will help you in limiting selecting the wrong dataset in further analysis, and bring a little **tidiness** for future review of your MS/MS analysis process. You should also be able to make a better peakpicking in the future in the same history and it will not be polluated by MS/MS part of your process.
-> {: .text-justify}
->
-> > Copy dataset to a new history
-> >
-> > 1. Click on the {% icon galaxy-gear %} icon (**History options**) on the top of the history panel
-> > 2. Click on **Copy Dataset**
-> > 3. Select the desired files
-> > 4. Give a relevant name to the "New history"
-> > 5. Click on the new history name in the green box that have just appear to switch to this history
-> {: .tip}
->
-> To begin a new history with the files from your current history, you can **use the functionality ‘copy dataset’** and copy it into a new history (the option is hidden behind the notched wheel at the top right of the history).
-> {: .text-justify}
->
-> You may have notice that the XCMS tools generate **output names that contain the different XCMS steps you used**, allowing easy traceability while browsing your history. Hence, we highly recommend you to rename it **with something short**, e.g. "xset", "XCMSSetObject", or anything not too long that you may find convenient.
-> {: .text-justify}
-> {% snippet faqs/galaxy/datasets_rename.md %}
->
-{: .comment}
-
-Before the next step with msPurity package on MS/MS datas, here are some questions to be able to verify if your file is ready and if you have the same results as us. Please check these questions :
-{: .text-justify}
-
-> before MS/MS steps
+> before going to further GC-MS processing steps
>
> **1** - What are the steps of XCMS you made before your final file ?
> >
@@ -241,7 +413,7 @@ Before the next step with msPurity package on MS/MS datas, here are some questio
> > Here are the different steps made for our example :
> > - **(Not with XCMS)** import your datas into Galaxy instance
> > - **MSNbase readMSData** {% icon tool %} to read our MS datas
-> > - XCMS peakpicking with **xcms findChromPeaks (xcmsSet)** {% icon tool %} tool
+> > - XCMS peak picking with **xcms findChromPeaks (xcmsSet)** {% icon tool %} tool
> > - (Not with XCMS but necessary) merge my datas into one file with **xcms findChromPeaks Merger** {% icon tool %} tool
> > - XCMS grouping with **xcms groupChromPeaks (group)** {% icon tool %} tool
> > - **(Not done)** XCMS retention time correction, then grouping again with xcms adjustRtime (retcor) {% icon tool %} tool
@@ -253,7 +425,7 @@ Before the next step with msPurity package on MS/MS datas, here are some questio
> >
> >
> > During each step of XCMS pre-processing, the name of the file which is processing is completed by the name of the step you were doing. So, finally your file should be name `xset.merged.groupChromPeaks.fillChromPeaks.RData`. That because (as seen in previous answer) you ran a grouping and the integration after merged datas.
-> > {: .text-justify}
+> >
> >
> {: .solution}
>
@@ -261,137 +433,46 @@ Before the next step with msPurity package on MS/MS datas, here are some questio
> >
> >
> > To be able to see the size of a file in your history, you just have to select it. It will deploy informations about it and you can see the size of yours. For our example, the size of the final file is **1.4 MB**.
-> > {: .text-justify}
+> >
> >
> {: .solution}
{: .question}
+
-# Processing with metaMS part (option 2)
-
-**metaMS** is a R package for MS-based metabolomics data. It can do basic peak picking and grouping using functions from **XCMS** and **CAMERA** packages. The main output of **metaMS** is a table of feature intensities in all samples which can be analyzed with multivariate methods immediately. The package also offers the possibility to create in-house databases of mass spectra (including retention information) of pure chemical compounds. These databases can then be used for annotation purposes. The most important functions of this package are *runGC* and *runLC* (and each one to create databases *createSTDdbGC* and *createSTDdbLC*).
-{: .text-justify}
-During this tutorial we are interested in GC-MS analysis, so we will use the *runGC* function of **metaMS** and described it in details to be able to understand this function. The standard workflow of **metaMS** for GC-MS data is the following :
-
-
-
-The *runGC* function is implemented in **metaMS.runGC {% icon tool %} tool** in W4M Galaxy. It takes a vector of file names, corresponding to the samples, and a settings list as mandatory arguments. In addition, some extra arguments can be provided. In particular, a database of standards, as discussed later in the tutorial, can be provided for annotation purposes. This tool regroups all these steps that are described in the following parts to be able to understand all its functionalities and particularities. We will run the tool after we understand each of its steps because it is important to know what are the best parameters for our data and why each parameter is done.
-{: .text-justify}
-
-
-## Peak picking
-
-The peak picking is performed by the usual **XCMS** functions. A function has been written in **metaMS** to allow the individual parameters to be passed to the function as a settings list. The result is that the whole of the **XCMS** functionality is available, simply by changing the values of some settings, or by adding fields.
- {: .text-justify}
-Whereas the package is not up-to-date since the new version of **XCMS** (3.x). This new version brought a lot of new objects and transformed the peak picking process. To have the last version of this process, **metaMS** authorized **to start its function directly with the file containing all peak picking results**.
-{: .text-justify}
-Due to this update, **we have already processed the peak picking during the first part** of this tutorial. So we can continue it with the file outputted from the peak picking part. This also allow us to make a good peak picking without the following step include in **metaMS** functions. So it takes less time of processing and we can verify our peaks with this cut between peak picking and the following steps of GC-MS analysis.
-{: .text-justify}
-
-## Definition of pseudo-spectra
-
-Rather than a feature-based analysis with individual peaks, as is the case with **XCMS**, **metaMS** performs a pseudospectrum-based analysis. So, the basic entity is a set of m/z values showing a chromatographic peak at the same retention time.
-{: .text-justify}
-
-
-
-This choice is motivated by several considerations. First of all, **in GC the amount of overlap is much less than in LC** : peaks are much narrower. This means that even a one- or two-second difference in retention time can be enough to separate the corresponding mass spectra. Secondly, fragmentation patterns for many compounds are **available in extensive libraries like the [NIST library](http://www.nist.gov/srd/nist1a.cfm "NIST library")**. In addition, the spectra are somewhat easier to interpret since adducts, such as found in LC, are not present. The main advantage of pseudo-spectra, however, is that their use allows the results to be interpreted directly as relative concentrations of chemical compounds : **a fingerprint in terms of chemical composition is obtained**, rather than a fingerprint in terms of hard-to-interpret features. The pseudo-spectra are obtained by simply clustering on retention time, using the *runCAMERA* function, which for GC data calls *groupFWHM*. All the usual parameters for the *groupFWHM* function are included in W4M Galaxy **metaMS.runGC {% icon tool %} tool**. The most important parameter is *perfwhm*, which determines the maximal retention time difference of features in one pseudospectrum.
-{: .text-justify}
-
-The final step is to convert the **CAMERA** objects into easily handled lists, which are basically the R equivalent of the often-used `msp` format from the AMDIS software ({% cite Stein1999 %}). The `msp` file is a nested list, with one entry for each sample, and each sample represented by a number of fields. The pseudo-spectra are three-column matrices, containing m/z, intensity and retention time information, respectively. They can be draw with the *plotPseudoSpectrum* function of **metaMS** package easily (Figure 2).
-{: .text-justify}
-
-
-## Annotation
-
-Once we have identified our pseudo-spectra, we can start the annotation process. This is done by **comparing every pseudospectrum to a database of spectra**. As a similarity measure, we use the weighted dot product as it is fast, simple, and gives good results ({% cite Stein1994 %}). The first step in the comparison is based on retention, since a comparison of either retention time or retention index is much faster than a spectral comparison. The corresponding function is *matchSamples2DB*. Since the weighted dot product uses scaled mass spectra, the scaling of the database is done once, and then used in all comparisons.
-{: .text-justify}
-
- and a database entry (blue)")
-
-This *matchSamples2DB* function returns a table where all patterns that have a match with a DB entry are shown in the first column, and the DB entry itself in the second column. If for a particular experimental pattern more than one match is found, the alternatives (with a lower match factor) are shown in the last column. To see the match for a particular pattern, one can use the function *matchExpSpec*, returning matchfactors (numbers between 0 and 1, where the latter means a perfect match) for all entries in the database (if the plotIt argument is TRUE, the best match is shown – see Figure 2). Samples may contain compounds that are not of any interest, such as plasticizers, internal standards, column material etc.... These can be filtered out before doing an annotation : **metaMS** allows certain categories of database entries (defined in slot *matchIrrelevants* of the settings object) to be removed before further annotation. If the spectra of these compounds are very specific (and they often are), the retention criterion may be bypassed by setting the maximal retention time difference to very high values, which leads to the removal of such spectra wherever they occur in the chromatogram.
-{: .text-justify}
-
-
-## Unknowns research
-The most important aspect of untargeted metabolomics is the definition of unknowns, patterns that occur repeatedly in several samples, but for which no annotation has been found. In **metaMS** these unknowns are found by comparing all patterns within a certain retention time (or retention index) difference on their spectral characteristics. The same match function is used, but the threshold may be different from the threshold used to match with the database of standards. Likewise, the maximum retention time(index)difference may be different, too.
-
-In defining unknowns we have so far used settings that are more strict than when compared to a database : since all samples are typically measured in one single run, expected retention time differences are rather small. In addition, one would expect reproducible spectra for a single compound. A true unknown, or at least an interesting one, is also present in a significant fraction of the samples. All these parameters are gathered in thebetweenSampleselement of the settingsobject .Since the matching is done using scaled patterns, we need to create a scaled version of the experimental pseudo-spectra first.
-{: .text-justify}
-
-For large numbers of samples, this process can take quite some time (it scales quadratically), especiallyif the allowed difference in retention time is large. The result now is a list of two elements : the first is the annotation table that we also saw after the comparison with the database, and the second is a list of pseudo-spectra corresponding to unknowns. In the annotation table, negative indices correspond to the pseudo-spectra in this list.
-{: .text-justify}
-
-
-## Outputs and results
-
-At this stage, all elements are complete : we have the list of pseudo-spectra with an annotation, either as a chemical standard from the database, or an unknown occurring in a sizeable fraction of the injections. The only things left to do is to calculate relative intensities for the pseudo-spectra, and to put the results in an easy-to-use table. This table consists of two parts. The first part is the information on the “features”, which here are the pseudo-spectra. The second part of the table contains the intensities of these features in the individual injections.
-{: .text-justify}
-
-
-
-The first five lines are the standards, and the next ones are the unknowns that are identified by the pipeline. In the manual interpretation of this kind of data, the intensities of one or two “highly specific” features are often used to achieve relative quantitation. In an automatic pipeline, this is a risky strategy: not only can the intensity of a peak vary quite dramatically (relative standard deviations of up to 30% are assumed acceptable in GC-MS, e.g. when SPME is applied), but these errors are all the more pronounced in high-intensity peaks (hence the common use of a relative standard deviation).
-
-In addition, one is ignoring the information in the other peaks of the pseudospectrum. In **metaMS**, pseudospectrum intensity is expressed as a multiple of the corresponding reference pattern (either a database pattern or an unknown), where the intensity ratio is determined using robust regression to avoid one deviating feature to influence the results too much ({% cite Wehrens2014 %}). First, we define an object containing all relevant pseudo-spectra, and next the intensities are generated.
-{: .text-justify}
-
-In both cases, the result is a list containing a set of patterns corresponding with the compounds that have been found, either annotated or unknown, the relative intensities of these patterns in the individual annotations, and possibly the xcmsSetobject for further inspection. In practice, the *runGC* function is all that users need to use.
-{: .text-justify}
-
-That file can be used for database search online (as Golm ({% cite Kopka2005 %}) and MassBank ({% cite Horai2010 %})) or locally (NIST MSSEARCH) for NIST search a tutorial is available here.
-{: .text-justify}
-
-> metaMS.runGC
->
-> We now know each step of this *runGC* function. So, please open the **metaMS.runGC {% icon tool %} too** to run it. You should enter the following parameters for our tutorial :
-> - **Rdata from xcms and merged** : here you have to select your file from **XCMS** where you made the peak picking, grouping and all the pre-processing. It should be named `xset.merged.groupdChromPeaks.RData`.
-> - **Settings** : you can keep it at *user_default* but to see all possible parameters please set it at `use_defnied`.
-> - **RT range option** : it ables to select a region of retention time. If you select to *show* it, you have to enter the window in minutes, separate by a coma (for example 5,20 to have results between 5 minutes and 20 minutes). For our tutorial, we `keep it to hide`.
-> - **RT_Diff** : it is the allowed retention time difference in minutes between the same compound/unknown in different sample. For our tutorial, `keep it at 0.05` to have low differences between unknowns' retention times.
-> - **Min_Features** : this parameter is used during the comparison with database or unknowns. It corresponds to the minimal number of features required to have a valid pseudospectrum. For our tutorial, please `keep it to 5` to have really good compounds.
-> - **similarity_threshold** : this parameter is also used for comparison. It is the minimum similarity allowed between peaks mass spectra to be considers as equal. For our tutorial, please `keep it to 0.7`.
-> - **min.class.fract** : it corresponds to the minimal fraction of samples in which a pseudospectrum is present before it is regarded as an unknown. For the tutorial, please `keep it to 0.5`.
-> - **min.class.size** : it corresponds to the minimum number of samples in which a pseudospectrum should be present before it is regarded as an unknown. For our tutorial, please `set it to 2` because we have classes with only 2 samples.
-> - **Use Personnal DataBase option** : you can compare your datas to a personnal database. If you want to do it start to choose `show` in this parameter. Then you will be able to select your file. If not, keep it to `hide` and you will only have unknowns as results.
-> - **DB file** : this parameter will appear if you choose to show it. You just have to `select your database in your file` to add this here. Be careful, this database has to respect some rules (please look at *?????????????????? part*).
-> - **Use RI option** : choose if you want to use the RI for standards.
-> - **RI file** : enter here `your RI file` which have to contains two columns : retention time and retention indices.
-> - **Use RI as filter** : just to know if you want to use RI parameter as a filter.
-> - **RIshift** : if you want to use RI as filter, please precise here the RI shift. For our tutorial `keep the previous parameter to FALSE`.
+># Verify your data after the pre-processing and clean datasets
>
+>When you have processed **all, or only the needed,** steps described above, you can continue the processing of your data with statistics or annotation tools.
+>Don't forget to always check the format of your files!
>
-{: .hands_on}
-
-
-# Take a look at your results after metaMS processing
-
-We choose to separate our first W4M Galaxy tool into 2 parts: the processing of GC-MS data (**metaMS.runGC {% icon tool %}**) and the plotting results of these data (**metaMS.plot {% icon tool %}**). So we now have the first part describes just before and the second part we will describe just after. This part allows users to see the TIC (Total Ion Chromatogram), BPC (Base Peak Chromatogram), and also all EICs (Extracted Ion Chromatogram) you want, from our previous result outputted from **metaMS.runGC {% icon tool %} tool**.
-{: .text-justify}
-
-If you separated your samples into different classes, this tool can constructs TICs and BPCs one class against one class, in a `pdf` file (Figure 5) :
-{: .text-justify}
-
-
-
-Concerning EICs, it is possible to choose for which compound you want to draw an EIC when you run the W4M Galaxy tool. According to your choice, you will obtain EICs for one compound in each sample you enter in the previous **metaMS** part.
-{: .text-justify}
-
-
-
-> metaMS.plot
+>The pre-processing part of this analysis can be **quite time-consuming**, and already corresponds to quite a number of steps, depending on your analysis. We highly recommend, at this step of the GC-MS workflow, to split your analysis by beginning a new Galaxy history with **only the files you need** for further steps (final `.tsv` matrices - sampleMetadata, variableMetadata, dataMatrix and the `.msp` spectral database). This will help you limit the chance of selecting the wrong dataset in further analysis, and bring a little **tidiness** for future review of your analysis process. You will also still be able to adjust the peak picking parameters later in the original history, without it being polluted by the statistical analysis part of your process.
+>
+>
+> > Copy dataset to a new history
+> >
+> > 1. Click on the {% icon galaxy-gear %} icon (**History options**) on the top of the history panel
+> > 2. Click on **Copy Dataset**
+> > 3. Select the desired files
+> > 4. Give a relevant name to the "New history"
+> > 5. Click on the new history name in the green box that has just appeared to switch to this history
+> >
+> {: .tip}
+>
+> To begin a new history with the files from your current history, you can **use the functionality ‘copy dataset’** and copy it into a new history (the option is hidden behind the notched wheel at the top right of the history).
>
-> This tool is very easy to run. It is an obligation to process **metaMS.runGC {% icon tool %}** before this one. After that, you just have to choose if you want or not to draw your TIC, BPC or EIC :
-> - **Rdata from new_metaMS_runGC** : the file you obtained with the **metaMS.runGC {% icon tool %}** tool. It should be named `runGC.RData`.
-> - **Do you want to process for TIC(s) ?** : if you select "yes" you will obtain the `pdf` file containing each TIC from each class against each others.
-> - **Do you want to process for BPC(s) ?** : if you select "yes" you will obtain the `pdf` file containing each BPC from each class against each others.
-> - **Do you want to process for EIC(s) ?** : if you select "yes" you will have to choose which compound(s) and unknown(s) you want to obtain its EIC.
-> - **EIC_Unknown** : here please choose which compound(s) or unknown(s) you want to obtain according to the `peaktable.tsv` file. For out tutorial it can be interesting to have a look at **all** the EICs. So put the `value to 0`.
>
-{: .hands_on}
-
+> You may have noticed that the XCMS tools generate **output names that contain the different XCMS steps you used**, allowing easy traceability while browsing your history. Hence, we highly recommend that you rename the output **with something short**, e.g. "xset", "XCMSSetObject", or anything not too long that you may find convenient.
+>
+> {% snippet faqs/galaxy/datasets_rename.md %}
+>
+> > Important : Be careful of the file format
+> > During each step of pre-processing, your dataset has its format changed and can also have its name changed. To be able to continue to GC-MS processing, you need to have, at least, an RData object which is **merged and grouped** (from **xcms findChromPeaks Merger** {% icon tool %} and **xcms groupChromPeaks (group)** {% icon tool %}). It means that you should have a file named `xset.merged.groupChromPeaks.RData` (possibly with some additional steps in its name).
+> {: .comment}
# Conclusion
+{% icon trophy %} Well done, you’ve processed GC-MS data with [metaMS (option 1)](#processing-with-metams-option-1) or [all with XCMS (option 2)](#process-gc-ms-data-with-a-full-xcms-workflow-option-2)! You might want to explore other [Galaxy trainings for Metabolomics]({% link topics/metabolomics/ %}).
+
+You might want to consult your results with the [key history](https://usegalaxy.fr/u/yguitton/h/gcms) or use [the workflow](https://usegalaxy.fr/u/yguitton/h/gcms) associated with this tutorial.
+
-
-