User:Makecat/BibcodeBot.py

# -*- coding: utf-8  -*-
'''
Copied from the English Wikipedia's [[:en:User:Bibcode Bot]]; even the original author
does not know its copyright licence.
The edit summaries were translated into Chinese and the strings made Unicode; nothing
that is not submitted to the wiki was changed. Some encoding bugs remain.
PythonAnywhere will not let the ADS URLs be fetched, and running it on my own computer
is too slow. Anyone who wants to use it is welcome to.
'''
import sys
import os
import re
import wikipedia
import urllib
import urllib2
import time
import login
import codecs
from _journal_list import *

site = wikipedia.getSite()
throttle_time = 5
errorfilepath = "_Article timeouts/"
regex = re.compile("(\{\{)|(\}\})")
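# matches "{{" or "}}"; findalltemplates() uses this to keep a nesting count while
# extracting whole {{cite ...}} templates, including any templates nested inside them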
m_codes = {}
# m_codes[u"AJ..."]=u"."
                
username = "Makecat-bot"
print "Logging in as Makecat-bot..."
login
print "Logged in!"

def main():
  with codecs.open("_Article list.txt", encoding="utf-8") as f:
    print "Starting run! \n--------------------------------------"
    for line in f:
      line = unicode(line.strip(u" \t\r\n*[]"))
      print "Getting page: " + line
      global page
      page = wikipedia.Page(site, line)
      if not page.exists():
        print "Page does not exist! Skipping to next article.\n--------------------------------------"
        continue
      if page.isRedirectPage():
        oldpage = line
        page = page.getRedirectTarget()
        newpage = page.title()
        print "   '" + oldpage + "' redirects to '" + newpage + "'.\n   Processing '" + newpage + "' instead."
      if not page.canBeEdited():
        print "Page cannot be edited due to protection! Skipping to next article.\n--------------------------------------"
        continue
      if not page.botMayEdit(username):
        print "Page cannot be edited by a bot. Skipping to next article.\n--------------------------------------"
        continue
      text = page.get()
      orig_text = text
      text = parse_template(text)
      if text != orig_text:
        if id_to_arxiv_count != 0:
          print "\nConverting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\"."
          print "\nAdding " + str(arxiv_count) + " arxiv eprint(s), " + str(bibcode_count) + " bibcode(s) and " + str(doi_count) + " doi(s) in " + str(counter) + " possible templates."
          editsummary = u"转换" + str(id_to_arxiv_count) + u"个\"id = {{arxiv|...}}\"为\"|arxiv=...\";添加" + str(arxiv_count) + u"个[[arXiv]]," + str(bibcode_count) + u"个[[bibcode]]和" + str(doi_count) + u"个[[DOI]]。"
        else:
          editsummary = u"添加" + str(arxiv_count) + u"个[[arXiv]]," + str(bibcode_count) + u"个[[bibcode]]和" + str(doi_count) + u"个[[DOI]]。"
        try:
          page.put(text, editsummary + u"出现错误?请至[[User talk:Makecat]]提出。", maxTries = 2)
        except wikipedia.MaxTriesExceededError:
            try:
                print "Couldn't send data to Wikipedia.  Saving page data to " + errorfilepath + page.title()
                f = open(errorfilepath + page.title() + ".txt", "w")
                f.write(text.encode("utf-8"))
                f.close()
            except:
                print "Error saving data to file.  Printing page:\n\n\n\n\n"
                print text
      else:
        print "\nNo new arxiv eprint, new bibcode or new doi found in this article. Processing next article.\n--------------------------------------"
    print "\nRun complete!"

def get_bibcode(data): # data: dict of citation template parameters built by parse_template
    # Bibcode format is YYYYJJJJJVVVVMPPPPA
    # YYYY  = Year
    # JJJJJ = Journal code
    # VVVV  = Volume
    # M     = Section code / Headache
    # PPPP  = Page
    # A     = First letter of the last name of the first author
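    # Illustrative example (assuming the journal list maps the Astrophysical Journal
    # to the code "ApJ.."): year 2005, volume 620, page 459, first author "Smith"
    # would give the bibcode "2005ApJ...620..459S".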
  global unknown_journal
  # Extract the year part
  if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
      data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
  if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None:
      data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
  if "year" not in data != None:
      print "*** YEAR ERROR *** - No year found in citation."
      unknown_journal = "None"
      return False
  else:
    bibcode=u"%s" % data["year"]  
  # Let"s figure out the correct journal so we can get the JJJJJ value
  jkey = ""
  if not data.has_key("journal"):
    if data.has_key("work"):
      data["journal"] = data["work"]
    elif data.has_key("periodical"):
      data["journal"] = data["periodical"]
    else:
      print "*** JOURNAL ERROR *** - No journal found in citation."
      unknown_journal = "None"
      return False
  if data["journal"]:
    if data["journal"].lower().startswith("the "):
      data["journal"] = data["journal"][4:].strip()
    if data["journal"].endswith("."):
      data["journal"] = data["journal"].strip(".")
    for key in journals.keys():
      for item in journals[key]:
        # second part of the tuple is a boolean flagging a regex pattern
        if item[1]:
          if re.search(item[0], data["journal"]):
            jkey = key
            break
        # if it's not a regex, compare against the plain journal title
        else:
          title = item[0]
          if title.lower().startswith("the "):
            title = title[4:].strip()
          if title.endswith("."):
            title = title.strip(".")
          if data["journal"].lower() == title.lower():
            jkey = key
            break
      if jkey != "":
        break
  if jkey == "":
    print "*** JOURNAL ERROR *** - Unknown journal detected (" + data["journal"] + ")."
    unknown_journal = data["journal"]
    return False
  else:
    unknown_journal = "None"
  # using the J key, let's see if there is an M code defined
  if m_codes.has_key(jkey):
    m_code = m_codes[jkey]
  else:
    # default to . otherwise
    m_code = "."
  bibcode+= jkey
  pad_str=u""
  # let's get the volume number and then define the VVVV value
  if not data.has_key("volume"):
    print "*** VOLUME ERROR *** - No volume found in citation."
    return False
  else:
    try:
      data["volume"] = re.search(r"\d+", data["volume"]).group(0)
    except:
      print "*** VOLUME ERROR *** - Volume found (" + data["volume"] +"), but not parsable."
      return False
    pad = 4-len(data["volume"])
    while pad>0:
      pad=pad-1
      pad_str+=u"."
    bibcode+=pad_str+data["volume"]
  # boolean to see if we ignore the M code later
  ignore_m = False
  # handle both page and pages parameters
  pg = False
  pg_0 = False
  pg_1 = False
  if data.has_key("page"):
    if re.search("L\d+",data["page"],re.I):
      m_code = u"L"
    if re.search("\d+",data["page"],re.I):
      pg_0 = re.search(ur"\d+",data["page"],re.I).group(0)
    else:
      pg_0 = False
  if data.has_key("pages"):
    if re.search("L\d+",data["pages"],re.I):
      m_code = u"L"
    if re.search("\d+",data["pages"],re.I):
      pg_1 = re.search(ur"\d+",data["pages"],re.I).group(0)
    else:
      pg_1 = False
  if not pg_0 and not pg_1:
    print "*** PAGE ERROR *** - No page detected."
    return False
  else:
    if pg_1:
      pg = pg_1
    else:
      pg = pg_0
  if not data.has_key("page") and not data.has_key("pages"):
    print "*** PAGE ERROR *** - No page detected."
    return False  
  # let's define PPPP and whether or not M should be ignored
  # if it's less than 4, pad it; if it's exactly 4, skip ahead
  if len(pg)<4:
    pad_str=u""
    pad = 4-len(pg)
    while pad>0:
      pad=pad-1
      pad_str+=u"."
    pg = pad_str+pg
  elif len(pg)==5:
    # if it's 5, M should be ignored and the 5th page number should be used instead
    ignore_m = True
  elif len(pg)==6:
    # if it's 6, convert the last 2 to a letter and ignore M
    ignore_m = True
    alpha = "abcdefghijklmnopqrstuvwxyz"
    lettercode = alpha[int(pg[:1])]
    pg = lettercode+pg[2:]
  # now to combine everything
  if ignore_m:
    m_code =""
  if data.has_key("last1"):
    a = data["last1"][0]
  elif data.has_key("last"):
    a = data["last"][0]
  else:
    a = "."
  return bibcode+m_code+pg+a
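
# Illustrative call of get_bibcode() (hypothetical data; assumes the journal list
# recognises "Astrophysical Journal" as "ApJ.."):
#   get_bibcode({"year": u"2005", "journal": u"Astrophysical Journal",
#                "volume": u"620", "pages": u"459", "last1": u"Smith"})
#   would return u"2005ApJ...620..459S".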

def parse_template(text):
  #Kingpin's regex:  \{\{cite\sjournal(((\{\{.+\}\})|[^{]([{][^{])?)+)\}\}
  found = findalltemplates(text)
  global counter
  counter = 0
  global id_to_arxiv_count
  id_to_arxiv_count = 0
  global arxiv_count
  arxiv_count = 0
  global bibcode_count
  bibcode_count = 0
  global doi_count
  doi_count = 0
  unknown_journal_list = "None"
  for item in found:
    #Used to compare the result at the end of the processing
    old_item = item
    #Pre-cleaner (cleans |id={{arxiv|foobar}} to |arxiv=foobar)
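    # e.g. (illustrative) "|id = {{arxiv|astro-ph/0601001}}" and
    # "|id = {{arxiv|archive=astro-ph|id=0601001}}" both become "|arxiv = astro-ph/0601001"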
    if re.search("{{\s*arxiv", item):
      if re.findall(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}", item):
        clean_str = re.sub(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}",ur"|\1arxiv\2=\3\5/\7\8", item)
        if re.findall(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})", clean_str):
          clean_str = re.sub(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})",ur"|\1arxiv\2=\3\4\5\6", clean_str)
        id_to_arxiv_count += 1
        item = clean_str
      
    global unknown_journal
    unknown_journal = "None"
    counter += 1    
    pairs = re.finditer(u"(?P<key>\w+)\s*=\s*(?P<value>.*?)(\n\s*|\||\})",item)
    data = {}
    for pair in pairs:
      key = pair.group("key").strip()
      value = pair.group("value").strip(u" []\t\r\n")
      value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
      if len(value)>0:
        data[key] = value
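    # Illustrative example: a parameter such as "| volume = 620" ends up as
    # data["volume"] == u"620"; empty parameters and HTML comments are dropped.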

    # The following gets rid of the error messages if any of last1/last/year/date/etc... is missing
    # This is used to build a message more explicit than "Examining citation 15"
    # Such as "Schwartz (2000). MISSING JOURNAL, v.194, p.123"
    # The code might be stupid and weird, but it seems to work just fine
    # -Headbomb
    if "last1" not in data != None:
      if "last" not in data != None:
        author_message = "MISSING AUTHOR"
      else:
        author_message = data["last"]
    else:
      author_message = data["last1"]
    if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
      data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
    if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None:
      data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
    if "year" not in data != None:
      year_message = "MISSING YEAR"
    else:
      year_message = data["year"]
    if "journal" not in data != None:
      if "work" not in data != None:
        if "periodical" not in data != None:
          journal_message = "MISSING JOURNAL"
        else:
          journal_message = data["periodical"]
      else:
        journal_message = data["work"]
    else:
      journal_message = data["journal"]
    if "volume" not in data !=None:
      volume_message = "MISSING"
    else:
      volume_message = data["volume"]
    if "pages" not in data != None:
      if "page" not in data != None:
        page_message = "MISSING"
      else:
        page_message = data["page"]
    else:
      page_message = data["pages"]
    if "arxiv" not in data != None:
      arxiv_message = "MISSING"
    else:
      arxiv_message = data["arxiv"]
    if "bibcode" not in data != None:
      bibcode_message = "MISSING"
    else:
      bibcode_message = data["bibcode"]
    if "doi" not in data != None:
      doi_message = "MISSING"
    else:
      doi_message = data["doi"]	
	  
    #Message identifying what citation we"re dealing with
    print "\nExamining citation " + str(counter)
    print "   " + author_message + " (" + year_message + "). " + journal_message + ", v." + volume_message + ", p." + page_message
    print "   arxiv  : " + arxiv_message
    print "   bibcode: " + bibcode_message
    print "   doi    : " + doi_message
    
    #Safety net for now. Will be removed later
    arxiv = False
    arxiv_exists = False
    bibcode = False
    bibcode_exists = False
    doi = False
    doi_exists = False
    new_str = None
    
    #ARXIV, BIBCODE AND DOI ARE FOUND --> DO NOTHING    
    if data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"):
      print "arxiv (OK), bibcode (OK), doi (OK). Nothing to do."
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = data["doi"]
      doi_exists = True
    
    #ARXIV NOT FOUND, BIBCODE FOUND, DOI FOUND --> SEARCH FOR ARXIV 
    if not data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"):
      print "arxiv (??), bibcode (OK), doi (OK). Searching for arxiv."
      query = ADS_query(arxiv, data["bibcode"], data["doi"], item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = data["doi"]
      doi_exists = True
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=( *)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=( *)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)

    #ARXIV FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR BIBCODE
    if data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"):
      print "arxiv (OK), bibcode (??), doi (OK). Searching for bibcode."
      query = ADS_query(data["arxiv"], bibcode, data["doi"], item)
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
        doi = data["doi"]
        doi_exists = True
      if bibcode_exists:
        if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
        else:
          new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
        
    #ARXIV FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR DOI
    if data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"):
      print "arxiv (OK), bibcode (OK), doi (??). Searching for doi."
      query = ADS_query(data["arxiv"], data["bibcode"], doi, item)
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = query[2]    
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if doi_exists:
        if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
        else:
          new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
    
    #ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR ARXIV AND BIBCODE
    if not data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"):
      print "arxiv (??), bibcode (??), doi (OK). Searching for arxiv and bibcode."
      query = ADS_query(arxiv, bibcode, data["doi"], item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = data["doi"] 
      doi_exists = True
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
      if bibcode_exists:
        if new_str != None:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
          else:
           new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str)
        else:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
          else:
            new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)

    #ARXIV FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI
    if data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"):
      print "arxiv (OK), bibcode (??), doi (??). Searching for bibcode and doi."
      query = ADS_query(data["arxiv"], bibcode, doi, item)
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if bibcode_exists:
        if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
        else:
          new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
      if doi_exists:
        if new_str != None:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
        else:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)

    #ARXIV NOT FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI
    if not data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"):
      print "arxiv (??), bibcode (OK), doi (??). Searching for bibcode and doi."
      query = ADS_query(arxiv, data["bibcode"], doi, item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
      if doi_exists:
        if new_str != None:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
        else:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
            
    #ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR ARXIV, BIBCODE AND DOI
    if not data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"):
      print "arxiv (??), bibcode (??), doi (??). Guessing bibcode..."
      query = ADS_query(arxiv, bibcode, doi, item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
      if bibcode_exists:
        if new_str != None:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
          else:
            new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str)
        else:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
          else:
            new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
      if doi_exists:
        if new_str != None:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
        else:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
      
    if new_str:
      text = text.replace(old_item, new_str)
    else:
      text = text.replace(old_item, item)
    if unknown_journal_list is "None":
      if unknown_journal is not "None":
        unknown_journal_list = "\nUnknown journal(s) for " + page.title() + ":\n   *" + unicode(unknown_journal) + "\n"
    else:
      if unknown_journal is not "None":
        if not re.search(unicode(unknown_journal) + "\n", unknown_journal_list):
          unknown_journal_list = unknown_journal_list + "   *" + unknown_journal + "\n"
  print "\nFound:\n   " + str(counter) + " {{citation}}/{{cite journal}} template(s)\n   " +str(id_to_arxiv_count) + " '|id={{arxiv|...}}' to convert to '|arxiv=...'\n   " +str(arxiv_count) + " new arxiv eprint(s)\n   " + str(bibcode_count) + " new bibcode(s)\n   " + str(doi_count) + " new doi(s)."
  if unknown_journal_list is "None":
    print "\nUnknown journals:\n   *None"
  else:
    print unknown_journal_list
    f = open("_Unknown_journals.txt", "a")
    f.write(unknown_journal_list.encode("utf-8"))
    f.close()
  return text

            
def findalltemplates(t):
  f = []
  lowertext = t.lower()
  while re.search("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper)\s*\|", lowertext) != None:
    firstoffset = re.search("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper)\s*\|", lowertext).start()
    lastoffset = firstoffset
    counter = 1
    while counter > 0:
      nextbracket = regex.search(lowertext, lastoffset+1)
      if nextbracket is None:
        # unbalanced braces; stop scanning this template to avoid an AttributeError
        break
      if nextbracket.group(0) == "{{":
        counter += 1
        lastoffset = nextbracket.end()
      elif nextbracket.group(0) == "}}":
        counter -= 1
        lastoffset = nextbracket.end()
    f.append(t[firstoffset:lastoffset])
    t = t[lastoffset:]
    lowertext = lowertext[lastoffset:]
  return f
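
# Illustrative behaviour: for wikitext containing
# "{{cite journal |title=Foo |journal={{lang|en|Bar}} }}", findalltemplates()
# returns the whole template (including the nested {{lang|...}}) as one list entry.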

def queryADS(url):
  retry = True
  timeout = max(1, throttle_time)
  retrynum = 0
  while retry:
    try:
      rawdata = urllib2.urlopen(url).read()
      retry = False
    except urllib2.URLError:
      retrynum += 1
      timeout = retrynum * throttle_time
      if retrynum > 3:
        print "Cannot connect to ADS site.  Aborting..."
        return ""
      print "\nError connecting to ADS site.  Retrying in " + str(timeout) + " seconds."
      time.sleep(timeout)
      continue
  return rawdata
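
# Note: with throttle_time = 5, the retries above wait 5, 10 and 15 seconds
# (retrynum * throttle_time) before giving up and returning an empty string.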

def adv_check_bibcode(code): #Try to find a valid author / section code
  if code:
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ."
    url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX"
    for i in range(27):
      url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
      url += "&db_key=ALL"
    print "Probing for a new author..."
    raw_html = queryADS(url)
    bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
    if bibcode_check:
      print "   Found! " + bibcode_check[0]
      return raw_html
    else:
      print "   Not found!"
      alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      alphalower = "abcdefghijklmnopqrstuvwxyz"
      url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
      for i in range(26):
        url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
      print "Probing for a new section..."
      raw_html = queryADS(url)
      bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
      if bibcode_check:
        print "   Found! " + bibcode_check[0]
        return raw_html
      else:
        url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
        for i in range(26):
          url += "&bibcode=" + urllib.quote(code[:-1]) + alphalower[i]
        raw_html = queryADS(url)
        bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
        if bibcode_check:
          print "   Found! " + bibcode_check[0]
          return raw_html
        else:
          print "   Not found!"
          return "Dummy text"  

def ADS_query(arxiv, bibcode, doi, item):
  arxiv_match = False
  bibcode_match = False
  doi_match = False
  raw_html = "Dummy text"
  pairs = re.finditer(u"(?P<key>\w+)\s*?=\s*?(?P<value>.*?)(\n|\||\})", item)
  data = {}
  for pair in pairs:
    key = pair.group("key").strip()
    value = pair.group("value").strip(" []")
    value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
    if len(value)>0:
      data[key] = value
  if not arxiv and not bibcode and not doi:
    bibcode_guess = get_bibcode(data)
    if bibcode_guess:
      print "Bibcode guess: " + bibcode_guess
      url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
      raw_html = queryADS(url)
      bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
      if bibcode_check:
        print "   Valid!"
      else:
        print "   Invalid!"
        raw_html = adv_check_bibcode(bibcode_guess)        
  if arxiv and not bibcode and not doi:
    url = "http://adsabs.harvard.edu/abs/arXiv:" + urllib.quote(arxiv)
    raw_html = queryADS(url)
    bibcode_check = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
    if bibcode_check:
      print "   Found bibcode by arxiv query!" + bibcode_check[0]
    else:
      print "   Did not find bibcode by arxiv query! Guessing bibcode..."
      bibcode_guess = get_bibcode(data)
      if bibcode_guess:
        print "Bibcode guess: " + bibcode_guess
        url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
        raw_html = queryADS(url)
        bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
        if bibcode_check:
          print "   Valid!"
        else:
          print "   Invalid!"
          raw_html = adv_check_bibcode(bibcode_guess)
  if bibcode:
    url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode)
    raw_html = queryADS(url)
  else:
    if doi:
      url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&doi=" + urllib.quote(doi.encode("utf-8"))
      raw_html = queryADS(url)
  arxiv_match_0 = re.findall("eprint = {arXiv:(.*)}", raw_html)
  arxiv_match_1 = re.findall("\(arXiv:(.*)\)", raw_html)
  bibcode_match_0 = re.findall("@ARTICLE{(...................)", raw_html)
  bibcode_match_1 = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
  doi_match_0 = re.findall("doi = {(.*?)}", raw_html)
  doi_match_1 = re.findall("<A href=\"http://dx\.doi\.org/(.*)\">", raw_html, flags=re.IGNORECASE)
  if not arxiv_match_0 and not arxiv_match_1 and not bibcode_match_0 and not bibcode_match_1 and not doi_match_0 and not doi_match_1:
    return ("NOT FOUND!", "NOT FOUND!", "NOT FOUND!")
  else:
    print "Query results:"
  if arxiv_match_0:
    arxiv_match = arxiv_match_0[0]
    print "   arxiv  : " + arxiv_match
  if arxiv_match_1:
    arxiv_match = arxiv_match_1[0]
    print "   arxiv  : " + arxiv_match
  if not arxiv_match:
    arxiv_match = "NOT FOUND!"
    print "   arxiv  : NOT FOUND!"
  if bibcode_match_0:
    bibcode_match = bibcode_match_0[0]
    print "   bibcode: " + bibcode_match
  if bibcode_match_1:
    bibcode_match = bibcode_match_1[0]
    print "   bibcode: " + bibcode_match
  if not bibcode_match:
    bibcode_match = "NOT FOUND!"
    print "   bibcode: NOT FOUND!"
  if doi_match_0:
    doi_match = doi_match_0[0]
    print "   doi    : " + doi_match
  if doi_match_1:
    doi_match = doi_match_1[0]
    print "   doi    : " + doi_match
  if not doi_match:
    doi_match = "NOT FOUND!"
    print "   doi    : NOT FOUND!"
  return (arxiv_match, bibcode_match, doi_match)  
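
# ADS_query() always returns a 3-tuple (arxiv, bibcode, doi); each identifier that
# could not be resolved is returned as the string "NOT FOUND!".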

if __name__ == "__main__":
  main()