import argparse
import sys
import os
import pandas


# Banner and short usage reminder printed when the command line is unusable.
# Kept as a raw string so the backslashes of the ASCII art stay literal and a
# backslash at the end of a banner row is never read as a line continuation.
HELPER_TEXT = r"""
.___  ___.  __  ___ .______    _______  .______
|   \/   | |  |/  / |   _  \  |       \ |   _  \
|  \  /  | |  '  /  |  |_)  | |  .--.  ||  |_)  |
|  |\/|  | |    <   |   _  <  |  |  |  ||      /
|  |  |  | |  .  \  |  |_)  | |  '--'  ||  |\  \----.
|__|  |__| |__|\__\ |______/  |_______/ | _| `._____|
_______________________________________________________________________________

 Pierre-Edouard GUERIN, Laetitia MATHON, Virginie MARQUES, Stephanie MANEL
 CNRS, EPHE, Sorbonne University, Montpellier, France
 version 0.2 "Tete de veau" March 2021

 Usage:
 > python3 mkbdr [options]
 For help:
 > python3 mkbdr --help
"""

# Columns a taxonomy curation CSV must contain (checked by check_curatecsv).
_CURATE_COLUMNS = ('current_name', 'ncbi_name', 'genus', 'family', 'ncbi_rank')

# Files that must be present inside an NCBI taxdump folder.
_TAXDUMP_FILES = ('nodes.dmp', 'names.dmp')

# Exit status used when the supplied arguments are rejected; 2 is the status
# argparse itself uses for command-line errors.
_EXIT_ARGUMENT_ERROR = 2


def check_taxdumpdir(ncbiTaxdumpDir):
    """Check that a folder exists and holds the expected NCBI taxdump files.

    Args:
        ncbiTaxdumpDir: path of the NCBI taxonomy folder.

    Returns:
        True when the folder exists and contains both ``nodes.dmp`` and
        ``names.dmp``; False otherwise. A message describing the first
        problem found is printed before returning False.
    """
    taxdump_dir = str(ncbiTaxdumpDir)
    if not os.path.exists(taxdump_dir):
        print('ARGUMENTS ERROR: arguments ncbi_taxdump {0}. This path not exists.'.format(ncbiTaxdumpDir))
        return False
    for dump_name in _TAXDUMP_FILES:
        dump_path = os.path.join(taxdump_dir, dump_name)
        if not os.path.isfile(dump_path):
            print('ARGUMENTS ERROR: arguments ncbi_taxdump {0}. {1} is missing.'.format(ncbiTaxdumpDir, dump_path))
            return False
    return True


def check_curatecsv(curateCsvFile):
    """Check that a curation file is a ';'-separated CSV with all required columns.

    Args:
        curateCsvFile: path of the taxonomy curation CSV file.

    Returns:
        True when the file exists, can be read by pandas with ';' as the
        separator and its header contains every required column; False
        otherwise. A message describing the first problem found is printed
        before returning False.
    """
    if not os.path.isfile(curateCsvFile):
        print('ARGUMENTS ERROR: arguments curate {0}. {0} file not exists.'.format(curateCsvFile))
        return False
    try:
        curation = pandas.read_csv(curateCsvFile, sep=";")
    except ValueError:
        # pandas' parser and empty-data errors are ValueError subclasses.
        print("ARGUMENTS ERROR: arguments curate {0}. {0} is not a CSV file and must have ';' as delimiter'.".format(curateCsvFile))
        return False
    header = set(curation.columns.values)
    for column in _CURATE_COLUMNS:
        if column not in header:
            print("ARGUMENTS ERROR: arguments curate {0}. {1} column is missing.".format(curateCsvFile, column))
            return False
    return True


def _build_parser():
    """Create the argparse parser with the ``validate`` and ``curegen`` sub-commands."""
    parser = argparse.ArgumentParser(description='mkbdr - to build a custom metabarcoding reference database.')
    subparsers = parser.add_subparsers(dest='command')

    validate = subparsers.add_parser('validate', help='check format and taxonomy')
    validate.add_argument("-f", "--fasta", type=str, help='path of the barcodes sequences FASTA file', required=True)
    validate.add_argument("-c", "--curate", type=str, help='path of the taxonomy curation CSV file. Header must be current_name;ncbi_name;genus;family;ncbi_rank. A curation CSV file can be generated with the command curegen', required=False, default="NA")
    validate.add_argument("-n", "--ncbi_taxdump", type=str, help='path of NCBI taxonomy folder', required=False, default="NA")
    validate.add_argument("-l", "--ncbi_taxdump_load", action='store_true', help='load NCBI taxonomy folder')
    validate.add_argument("-e", "--ncbi_taxdump_edition", action='store_true', help='allow edition of ncbi taxdump files to add new taxonomy nodes')
    validate.add_argument("-o", "--output_prefix", type=str, help='prefix of the output FASTA such as [PREFIX].fasta')

    curegen = subparsers.add_parser('curegen', help='try to correct wrong taxonomy')
    curegen.add_argument("-f", "--fasta", type=str, help='path of the barcodes sequences FASTA file', required=True)
    curegen.add_argument("-n", "--ncbi_taxdump", type=str, help='path of NCBI taxdump.tar.gz file', required=False, default="NA")
    curegen.add_argument("-d", "--database_globalnames", type=str, help="Name of the selected database from GlobalNames i.e. 'Catalogue of Life' or 'FishBase Cache'", required=False, default='FishBase Cache')
    curegen.add_argument("-o", "--output_prefix", type=str, help='prefix of the output curated taxonomy CSV such as [PREFIX].csv')
    curegen.add_argument("-l", "--ncbi_taxdump_load", action='store_true', help='load NCBI taxonomy folder')
    return parser


def _reject_arguments(usage):
    """Print the usage text and stop the program with a failing exit status."""
    print(usage)
    sys.exit(_EXIT_ARGUMENT_ERROR)


def parse_args(usage=HELPER_TEXT):
    """Parse ``sys.argv`` and validate the arguments of the chosen sub-command.

    Args:
        usage: text printed whenever the command line cannot be used.

    Returns:
        The ``argparse.Namespace`` of parsed arguments; ``command`` holds the
        selected sub-command ('validate' or 'curegen').

    Raises:
        SystemExit: with status 0 when no sub-command is given (the usage
            text is printed), and with a non-zero status when the arguments
            of ``validate`` fail validation.
    """
    args = _build_parser().parse_args()
    if args.command not in ('validate', 'curegen'):
        print(usage)
        sys.exit(0)

    print("Checking arguments...", end='')
    # Only the 'validate' sub-command has arguments that need checking here.
    if args.command == 'validate':
        if args.curate != 'NA' and not check_curatecsv(args.curate):
            _reject_arguments(usage)
        # Loading or editing the taxonomy needs a usable taxdump folder.
        if args.ncbi_taxdump_edition or args.ncbi_taxdump_load:
            if args.ncbi_taxdump == 'NA':
                print('ARGUMENT ERROR: ncbi_taxdump argument is needed.')
                _reject_arguments(usage)
            elif not check_taxdumpdir(args.ncbi_taxdump):
                _reject_arguments(usage)
    print("done.")
    return args