- """The command line entry point for Casanovo."""
- import datetime
- import logging
- import os
- import sys
-
- import click
- import psutil
- import pytorch_lightning as pl
- import torch
- import yaml
- import glob
-
- from version import _get_version as __version__
- from data import ms_io
- from denovo import model_runner
-
- logger = logging.getLogger("casanovo")
-
-
- @click.command()
@click.option(
    "--mode",
    default="train",
    help="\b\nThe mode in which to run Casanovo:\n"
    '- "denovo" will predict peptide sequences for\nunknown MS/MS spectra.\n'
    '- "train" will train a model (from scratch or by\ncontinuing training a '
    "previously trained model).\n"
    '- "eval" will evaluate the performance of a\ntrained model using '
    "previously acquired spectrum\nannotations.",
    type=click.Choice(["denovo", "train", "eval"]),
)
@click.option(
    "--model",
    help="The file name of the model weights (.ckpt file), or a directory "
    "of .ckpt files when evaluating.",
    # No click.Path validation here: in eval mode this may be a directory of
    # checkpoints rather than a single file.
)
@click.option(
    "--train_only",
    help="Train without running evaluation.",
    default=True,
    type=bool,
)
@click.option(
    "--pretrained",
    default=None,
    help="The file name of the pretrained model weights (.ckpt file).",
    type=click.Path(exists=True, dir_okay=False),
)
@click.option(
    "--peak_path",
    default="sample_data/*",
    help="The file path (or glob pattern) with peak files for predicting "
    "peptide sequences or training Casanovo.",
)
@click.option(
    "--peak_path_val",
    default="sample_data/*",
    help="The file path (or glob pattern) with peak files to be used as "
    "validation data during training.",
)
@click.option(
    "--config",
    default="config_small_t5.yaml",
    help="The file name of the configuration file with custom options. If not "
    "specified, a default configuration will be used.",
    type=click.Path(exists=True, dir_okay=False),
)
@click.option(
    "--output",
    default="exp/log",
    help="The base output file name to store logging (extension: .log) and "
    "(optionally) prediction results (extension: .mztab).",
    type=click.Path(dir_okay=False),
)
@click.option(
    "--batch_size",
    default=None,
    help="The training batch size across all GPUs; overrides "
    "`train_batch_size` from the configuration file if given.",
    type=int,
)
@click.option(
    "--encoder_lr",
    default=None,
    help="The learning rate for the encoder; defaults to the global "
    "`learning_rate` from the configuration file.",
    type=float,
)
def main(
    mode: str,
    model: str,
    pretrained: str,
    peak_path: str,
    peak_path_val: str,
    config: str,
    output: str,
    batch_size: int,
    train_only: bool,
    encoder_lr: float,
):
    """
    \b
    Casanovo: De novo mass spectrometry peptide sequencing with a transformer model.
    ================================================================================

    Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo
    mass spectrometry peptide sequencing with a transformer model. Proceedings
    of the 39th International Conference on Machine Learning - ICML '22 (2022)
    doi:10.1101/2022.02.07.479481.

    Official code website: https://github.com/Noble-Lab/casanovo
    """
    if output is None:
        output = os.path.join(
            os.getcwd(),
            f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}",
        )
    else:
        output = os.path.splitext(os.path.abspath(output))[0]
    # Make sure the output directory exists before the file handler below
    # tries to create the log file inside it.
    os.makedirs(os.path.dirname(output), exist_ok=True)

    # Configure logging.
    logging.captureWarnings(True)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    log_formatter = logging.Formatter(
        "{asctime} {levelname} [{name}/{processName}] {module}.{funcName} : "
        "{message}",
        style="{",
    )
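    # Produces records like (illustrative):
    #   2023-01-01 12:00:00,000 INFO [casanovo/MainProcess] casanovo.main : ...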
    console_handler = logging.StreamHandler(sys.stderr)
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(log_formatter)
    root.addHandler(console_handler)
    file_handler = logging.FileHandler(f"{output}.log")
    file_handler.setFormatter(log_formatter)
    root.addHandler(file_handler)
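    # The file handler sets no level of its own, so it emits every record the
    # root logger passes through, i.e. everything at DEBUG and above.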
    # Disable dependency non-critical log messages.
    logging.getLogger("depthcharge").setLevel(logging.INFO)
    logging.getLogger("h5py").setLevel(logging.WARNING)
    logging.getLogger("numba").setLevel(logging.WARNING)
    logging.getLogger("pytorch_lightning").setLevel(logging.WARNING)
    logging.getLogger("torch").setLevel(logging.WARNING)

    # Read parameters from the config file.
    if config is None:
        config = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "config.yaml"
        )
    # Remember the config file name for logging before `config` is rebound to
    # the parsed dictionary.
    config_fn = config
    with open(config) as f_in:
        config = yaml.safe_load(f_in)
    # Ensure that the config values have the correct type.
    config_types = dict(
        random_seed=int,
        n_peaks=int,
        min_mz=float,
        max_mz=float,
        min_intensity=float,
        remove_precursor_tol=float,
        max_charge=int,
        precursor_mass_tol=float,
        isotope_error_range=lambda min_max: (int(min_max[0]), int(min_max[1])),
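        # The lambda above converts the two-element list from the YAML file,
        # e.g. [0, 1], into a tuple of ints.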
        dim_model=int,
        n_head=int,
        dim_feedforward=int,
        n_layers=int,
        dropout=float,
        dim_intensity=int,
        max_length=int,
        n_log=int,
        warmup_iters=int,
        max_iters=int,
        learning_rate=float,
        weight_decay=float,
        train_batch_size=int,
        predict_batch_size=int,
        max_epochs=int,
        num_sanity_val_steps=int,
        train_from_scratch=bool,
        save_model=bool,
        model_save_folder_path=str,
        save_weights_only=bool,
        every_n_epochs=int,
        decoder_dim_model=int,
    )
    for k, t in config_types.items():
        try:
            if config[k] is not None:
                config[k] = t(config[k])
        except (TypeError, ValueError) as e:
            logger.error("Incorrect type for configuration value %s: %s", k, e)
            raise TypeError(
                f"Incorrect type for configuration value {k}: {e}"
            ) from e
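    # `residues` maps residue tokens to their masses, e.g. "G" -> 57.02146
    # (assuming the configuration lists monoisotopic masses).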
- config["residues"] = {
- str(aa): float(mass) for aa, mass in config["residues"].items()
- }
- if encoder_lr == None:
- config["encoder_lr"] = config["learning_rate"]
- else:
- config["encoder_lr"] = encoder_lr
- config["pretrained"] = pretrained
-
- # t5decoder: "pretrained_model"
- if "t5" in config_fn:
- config["t5decoder"] = "pretrained_model"
- else:
- config["t5decoder"] = None
    if batch_size is not None:
        config["train_batch_size"] = batch_size
    config["model_save_folder_path"] = os.path.dirname(output)
    os.makedirs(config["model_save_folder_path"], exist_ok=True)
    config["train_only"] = train_only
    # Add extra configuration options and scale by the number of GPUs.
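    # Note: under a multi-process data-parallel strategy (e.g. PyTorch
    # Lightning's DDP) each GPU runs its own process, so the worker count and
    # batch size below are per-process values; dividing by `n_gpus` keeps the
    # effective global batch size at the configured `train_batch_size`.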
    n_gpus = torch.cuda.device_count()
    config["n_workers"] = n_gpus * 8
    if n_gpus > 1:
        config["n_workers"] = config["n_workers"] // n_gpus
        config["train_batch_size"] = config["train_batch_size"] // n_gpus
    pl.seed_everything(seed=config["random_seed"], workers=True)

    # Log the active configuration.
- logger.info("Casanovo version %s", str(__version__))
- logger.debug("mode = %s", mode)
- logger.debug("model = %s", model)
- logger.debug("peak_path = %s", peak_path)
- logger.debug("peak_path_val = %s", peak_path_val)
- logger.debug("config = %s", config_fn)
- logger.debug("output = %s", output)
- for key, value in config.items():
- logger.debug("%s = %s", str(key), str(value))
-
- # Run Casanovo in the specified mode.
- if mode == "denovo":
- logger.info("Predict peptide sequences with Casanovo.")
- writer = ms_io.MztabWriter(f"{output}.mztab")
- writer.set_metadata(
- peak_path, config, model=model, config_filename=config_fn
- )
- model_runner.predict(peak_path, model, config, writer)
- writer.save()
- elif mode == "eval":
- logger.info("Evaluate a trained Casanovo model.")
- if model.split(".")[-1] == "ckpt":
- filelist = [model]
- else:
- filelist = glob.glob(os.path.join(model, "*.ckpt"))
- for file in filelist:
- logger.info(">>>>>>>>>Evaluate on %s" % file)
- model_runner.evaluate(peak_path_val, file, config)
- elif mode == "train":
- logger.info("Train the Casanovo model.")
- model_runner.train(peak_path, peak_path_val, model, config)
-
-
- if __name__ == "__main__":
- main()