import sys from powerOfZeroGaryConstant import * sys.path.append(PYTHON_METLIN_CLASSES) import sys from os import listdir from os.path import isfile, join import os.path import MySQLdb import MyDB import MyMETLINUtils import MyCSV import MyMol import MyMail import MyXLSX import Avanti import time import datetime from datetime import datetime import json import re import pandas as pd from pandas import ExcelWriter from pandas import ExcelFile MAX_USLEEP = 1000 usleep = lambda x: time.sleep(x/1000000.0) MOLECULE_HAVE_ANY_FRAGMENTS_POS = "MOLECULE_HAVE_ANY_FRAGMENTS_POS" # "How many molecules in positive ion mode have any fragments at 0eV" MOLECULE_HAVE_ANY_FRAGMENTS_NEG = "MOLECULE_HAVE_ANY_FRAGMENTS_NEG" # "How many molecules in negative ionization mode have any fragments at 0eV" MOLECULE_HAVE_ANY_EITHER_FRAGMENTS_POSNEG = "MOLECULE_HAVE_ANY_EITHER_FRAGMENTS_POSNEG" # "How many molecules have any fragments in either positive or negative ionization mode at 0eV" MOLECULE_HAVE_ONLY_1_FRAGMENT_POS = "MOLECULE_HAVE_ONLY_1_FRAGMENT_POS" # Ideally it would be for positive only, negative only, and both MOLECULE_HAVE_ONLY_1_FRAGMENT_NEG = "MOLECULE_HAVE_ONLY_1_FRAGMENT_NEG" # Ideally it would be for positive only, negative only, and both MOLECULE_HAVE_ONLY_1_FRAGMENT_POSNEG = "MOLECULE_HAVE_ONLY_1_FRAGMENT_POSNEG" # Ideally it would be for positive only, negative only, and both MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POS = "MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POS" # "Greater than or equal to. In positive only, in negative only, and both" MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_NEG = "MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_NEG" # "Greater than or equal to. In positive only, in negative only, and both" MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POSNEG = "MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POSNEG" # "Greater than or equal to. In positive only, in negative only, and both" MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POS = "MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POS" # "Greater than or equal to. In positive only, in negative only, and both" MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_NEG = "MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_NEG" # "Greater than or equal to. In positive only, in negative only, and both" MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POSNEG = "MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POSNEG" # "Greater than or equal to. In positive only, in negative only, and both" MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POS = "MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POS" # "Greater than or equal to 4 fragments at 0eV, in positive only, in negative only, and both" MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_NEG = "MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_NEG" # "Greater than or equal to 4 fragments at 0eV, in positive only, in negative only, and both" MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POSNEG = "MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POSNEG" # "Greater than or equal to 4 fragments at 0eV, in positive only, in negative only, and both" # 1%, 3%, 5%, and 10% PERCENT_1 = "PERCENT_1" PERCENT_3 = "PERCENT_3" PERCENT_5 = "PERCENT_5" PERCENT_10 = "PERCENT_10" PERCENT_LST = [PERCENT_1, PERCENT_3, PERCENT_5, PERCENT_10] statDict = { PERCENT_1:{ MOLECULE_HAVE_ANY_FRAGMENTS_POS:0, MOLECULE_HAVE_ANY_FRAGMENTS_NEG:0, MOLECULE_HAVE_ANY_EITHER_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_POS:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_NEG:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POSNEG:0 }, PERCENT_3:{ MOLECULE_HAVE_ANY_FRAGMENTS_POS:0, MOLECULE_HAVE_ANY_FRAGMENTS_NEG:0, MOLECULE_HAVE_ANY_EITHER_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_POS:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_NEG:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POSNEG:0 }, PERCENT_5:{ MOLECULE_HAVE_ANY_FRAGMENTS_POS:0, MOLECULE_HAVE_ANY_FRAGMENTS_NEG:0, MOLECULE_HAVE_ANY_EITHER_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_POS:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_NEG:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POSNEG:0 }, PERCENT_10:{ MOLECULE_HAVE_ANY_FRAGMENTS_POS:0, MOLECULE_HAVE_ANY_FRAGMENTS_NEG:0, MOLECULE_HAVE_ANY_EITHER_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_POS:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_NEG:0, MOLECULE_HAVE_ONLY_1_FRAGMENT_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POSNEG:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POS:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_NEG:0, MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POSNEG:0 } } ################################################################################ # # # ################################################################################ def is_nan(x): return (x != x) ################################################################################ # # # ################################################################################ def getStats(molid): sql = "SELECT specid FROM msms_spec where molid = " + str(molid) + " and mode = '+' and collisionE = 0 LIMIT 1" db = MyDB.MyDB(DB_SERVER, DB_USERNAME, DB_PASSWORD, DB_NAME_METLIN) ms2SpecPosTbl = db.queryRetTbl(sql) usleep(MAX_USLEEP) sql = "SELECT specid FROM msms_spec where molid = " + str(molid) + " and mode = '-' and collisionE = 0 LIMIT 1" db = MyDB.MyDB(DB_SERVER, DB_USERNAME, DB_PASSWORD, DB_NAME_METLIN) ms2SpecNegTbl = db.queryRetTbl(sql) usleep(MAX_USLEEP) for percentItem in PERCENT_LST: intensity = str(percentItem).split("_")[1] metaboliteMS2PosTbl = [] metaboliteMS2NegTbl = [] for item in ms2SpecPosTbl: specid = str(item['specid']) sql = "SELECT * FROM metabolite_ms2 where specid = " + str(specid) + " and intensity >= " + str(intensity) + " ORDER BY intensity ASC " db = MyDB.MyDB(DB_SERVER, DB_USERNAME, DB_PASSWORD, DB_NAME_METLIN) metaboliteMS2PosTbl = db.queryRetTbl(sql) usleep(MAX_USLEEP) for item in ms2SpecNegTbl: specid = str(item['specid']) sql = "SELECT * FROM metabolite_ms2 where specid = " + str(specid) + " and intensity >= " + str(intensity) + " ORDER BY intensity ASC " db = MyDB.MyDB(DB_SERVER, DB_USERNAME, DB_PASSWORD, DB_NAME_METLIN) metaboliteMS2NegTbl = db.queryRetTbl(sql) usleep(MAX_USLEEP) if len(metaboliteMS2PosTbl) > 1: statDict[percentItem][MOLECULE_HAVE_ANY_FRAGMENTS_POS] += 1 if len(metaboliteMS2NegTbl) > 1: statDict[percentItem][MOLECULE_HAVE_ANY_FRAGMENTS_NEG] += 1 if len(metaboliteMS2PosTbl) >= 1 or len(metaboliteMS2NegTbl) >= 1: statDict[percentItem][MOLECULE_HAVE_ANY_EITHER_FRAGMENTS_POSNEG] += 1 if len(metaboliteMS2PosTbl) == 1: statDict[percentItem][MOLECULE_HAVE_ONLY_1_FRAGMENT_POS] += 1 if len(metaboliteMS2NegTbl) == 1: statDict[percentItem][MOLECULE_HAVE_ONLY_1_FRAGMENT_NEG] += 1 if len(metaboliteMS2PosTbl) == 1 or len(metaboliteMS2NegTbl) == 1: statDict[percentItem][MOLECULE_HAVE_ONLY_1_FRAGMENT_POSNEG] += 1 if len(metaboliteMS2PosTbl) >= 2: statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POS] += 1 if len(metaboliteMS2NegTbl) >= 2: statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_NEG] += 1 if len(metaboliteMS2PosTbl) >= 2 or len(metaboliteMS2NegTbl) >= 2: statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POSNEG] += 1 if len(metaboliteMS2PosTbl) >= 3: statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POS] += 1 if len(metaboliteMS2NegTbl) >= 3: statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_NEG] += 1 if len(metaboliteMS2PosTbl) >= 3 or len(metaboliteMS2NegTbl) >= 3: statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POSNEG] += 1 if len(metaboliteMS2PosTbl) >= 4: statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POS] += 1 if len(metaboliteMS2NegTbl) >= 4: statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_NEG] += 1 if len(metaboliteMS2PosTbl) >= 4 or len(metaboliteMS2NegTbl) >= 4: statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POSNEG] += 1 ################################################################################ # # # ################################################################################ def getMS2MIDs(): sql = "" if LIMIT <= 10000: sql = "SELECT DISTINCT(met.molid) FROM metabolite met, msms_spec msms, metabolite_ms2 ms2 WHERE ((msms.collisionE=0) AND msms.molid=met.molid AND msms.specid=ms2.specid) ORDER BY met.molid ASC LIMIT " + str(LIMIT) else: sql = "SELECT DISTINCT(met.molid) FROM metabolite met, msms_spec msms, metabolite_ms2 ms2 WHERE ((msms.collisionE=0) AND msms.molid=met.molid AND msms.specid=ms2.specid) ORDER BY met.molid ASC " db = MyDB.MyDB(DB_SERVER, DB_USERNAME, DB_PASSWORD, DB_NAME_METLIN) result = db.queryRetTbl(sql) return result ################################################################################ # # # ################################################################################ for x in range(20): print("\n") LIMIT = 10001 # > 10,000 runs the whole database PRINT_MAX = 0.10 count = 0 db = MyDB.MyDB(DB_SERVER, DB_USERNAME, DB_PASSWORD, DB_NAME_METLIN) mid = getMS2MIDs() totalMid = len(mid) print("Running mid count: ", totalMid) for item in mid: molid = str(item['molid']) getStats(molid) count += 1 percentCompleted = float(float(count)/float(totalMid)) * 100.0 if count % (LIMIT * PRINT_MAX) == 0: print("Molid: " + str(molid) + "; Count: " + str(count) + "/" + str(totalMid) + "; Completed: " + str(float(percentCompleted))) print("====================================================================================================================") for percentItem in PERCENT_LST: print("METLIN ID Count: " + str(len(mid))) print("") print("Intensity: " + str(percentItem).split("_")[1] + "%") print("") print("How many molecules in positive ion mode have any fragments at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_ANY_FRAGMENTS_POS])) print("How many molecules in negative ionization mode have any fragments at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_ANY_FRAGMENTS_NEG])) print("How many molecules have any fragments in either positive or negative ionization mode at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_ANY_EITHER_FRAGMENTS_POSNEG])) print("") print("How many molecules in positive ion mode have only 1 fragment at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_ONLY_1_FRAGMENT_POS])) print("How many molecules in negative ion mode have only 1 fragment at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_ONLY_1_FRAGMENT_NEG])) print("How many molecules in positive or negative ion mode have only 1 fragment at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_ONLY_1_FRAGMENT_POSNEG])) print("") print("How many molecules in positive ion mode have greater than or equal to 2 fragments at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POS])) print("How many molecules in negative ion mode have greater than or equal to 2 fragments at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_NEG])) print("How many molecules in positive or negative ion mode have greater than or equal to 2 fragments at 0eV: " + str(statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_2_FRAGMENTS_POSNEG])) print("") print("How many molecules in positive ion mode have greater than or equal to 3 fragments at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POS])) print("How many molecules in negative ion mode have greater than or equal to 3 fragments at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_NEG])) print("How many molecules in positive or negative ion mode have greater than or equal to 3 fragments at 0eV: " + str(statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_3_FRAGMENTS_POSNEG])) print("") print("How many molecules in positive ion mode have greater than or equal to 4 fragments at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POS])) print("How many molecules in negative ion mode have greater than or equal to 4 fragments at 0eV : " + str(statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_NEG])) print("How many molecules in positive or negative ion mode have greater than or equal to 4 fragments at 0eV: " + str(statDict[percentItem][MOLECULE_HAVE_GT_OR_EQ_4_FRAGMENTS_POSNEG])) print("====================================================================================================================")