跳转到内容

User:Hillgentleman/ifexists

维基百科,自由的百科全书

中文維基百科中引用 #ifexist 超過一百次之頁

參攷:meta:help talk:page existence, http://noc.wikimedia.org/~tstarling/ifexist.log

#ifexistslog.py
# Fetch the #ifexist usage log published at
#     http://noc.wikimedia.org/~tstarling/ifexist.log
# and append its contents to the local file ifexists.log.1.
# The log contains lines like:
#     2007-12-03 06:27:16 zh_yuewiki: 131 http://zh-yue.wikipedia.org/wiki/%E6%B4%9B%E7%A3%AF%E5%B1%B1%E8%84%88

import urllib
import codecs
import time  # not used by the fetch below; retained with the file's other imports

try:
    # Python 3 keeps urlopen in urllib.request.
    from urllib.request import urlopen
except ImportError:
    # Python 2 keeps it directly in urllib.
    from urllib import urlopen

LOG_URL = 'http://noc.wikimedia.org/~tstarling/ifexist.log'
LOG_PATH = 'ifexists.log.1'

# Download the whole log; always release the connection, even if read() fails.
response = urlopen(LOG_URL)
try:
    raw = response.read()
finally:
    response.close()

# The response body is raw bytes.  Decode it explicitly: handing bytes to a
# UTF-8 codecs writer makes Python 2 decode them as ASCII first, which raises
# UnicodeDecodeError on any non-ASCII byte.  Undecodable bytes are replaced
# rather than aborting the download.
text = raw.decode('utf-8', 'replace')

# Append mode: every run adds the freshly downloaded log to the end of the
# existing file instead of overwriting it.
saveFile = codecs.open(LOG_PATH, 'a', encoding='utf-8')
try:
    saveFile.write(text)
finally:
    saveFile.close()
#ifexistslog-sort-zh.py
# Read the local copy of the #ifexist log (ifexists.log.1), extract the URLs
# that start with "http://zh", normalise them, drop duplicates, and append
# the sorted result to ifexists.sort.zh.1, one URL per line.
# The log contains lines like:
#     2007-12-03 06:27:16 zhwiki: 131 http://zh.wikipedia.org/wiki/%E6%B4%9B%E7%A3%AF%E5%B1%B1%E8%84%88
# The log itself comes from http://noc.wikimedia.org/~tstarling/ifexist.log

print('ifexistslog-sort-zh.py')

a = ''

import re
import codecs
#import urllib
#import time

try:
    # Python 2 name for the line-reading prompt.
    read_line = raw_input
except NameError:
    # Python 3 renamed it to input().
    read_line = input


## REGEX
# NOTE(review): "zh.+" accepts any character after "zh", so hosts such as
# zh-yue.wikipedia.org are matched as well as zh.wikipedia.org -- confirm
# whether that is intended.
urlX = re.compile(r'http\://zh.+\b',flags=re.U)
andX= re.compile(r'&.*\b',flags=re.U) #REMOVE &variant=zh-tw blablabla TAIL
shorturlX=re.compile(r'w/index\.php\?title=',flags=re.U) #CONVERT TO SHORTURL


# Open the log for reading only.  Append mode ('a+') leaves the initial read
# position platform-dependent, so read() could return an empty string.
# A missing log file now raises instead of being silently created empty.
logFile = codecs.open('ifexists.log.1', 'r', encoding='utf-8')
try:
    log_text = logFile.read()
finally:
    logFile.close()

# Every match, de-duplicated.
found_urls = set(urlX.findall(log_text))

# Pause before processing.  Pressing Return alone leaves `a` empty, which
# makes the output loop below prompt before each URL until something
# non-empty is typed.
a = read_line('haha\n\n\n\n\n\a')

## TREATING THE URLS AND HOPEFULLY FURTHER REDUCING THE SET

normalized_urls = set()
for url in found_urls:
    url = andX.sub('', url)            # remove the &... tail
    url = shorturlX.sub('wiki/', url)  # replace by the short URL form
    print(url)
    normalized_urls.add(url)

# Append mode: repeated runs add to the existing output file.
dataFile = codecs.open('ifexists.sort.zh.1', 'a', encoding='utf-8')
n = 0
try:
    # Sorted so the output order is deterministic, as the script name promises.
    for url in sorted(normalized_urls):
        print(url)
        if a == '':
            a = read_line('press return to continue, something else to automate')
        n += 1
        dataFile.write(url + '\n')
finally:
    dataFile.close()

print('%d urls in total.' % n)

# Disabled draft, kept only as an inert string literal; none of it is executed.
# It sketches a polling loop: read up to 100000 units from `file`, print them,
# optionally prompt the user, write the chunk to `saveFile`, then spin printing
# '..' until time.clock() has advanced by 10, and finally close `saveFile`.
# It expects `file`, `saveFile`, `a` and `time` to be defined by surrounding code.
"""

try:
  while True:
    x = file.read(100000)
    print x

    if a=='': a=raw_input('press Return to continue to wait, or press some other key to automatise')

    saveFile.write(x)
    
    currentTime=time.clock()
    while time.clock()< currentTime+10:
      print'..'

finally:
    saveFile.close()

"""