用戶:Hillgentleman/ifexists
外觀
中文維基百科中引用 #ifexist 超過一百次之頁
參攷:meta:help talk:page existence, http://noc.wikimedia.org/~tstarling/ifexist.log
# ifexistslog.py
# Fetch the #ifexist usage log and append it to a local file.
# The log consists of lines like:
#   2007-12-03 06:27:16 zh_yuewiki: 131 http://zh-yue.wikipedia.org/wiki/%E6%B4%9B%E7%A3%AF%E5%B1%B1%E8%84%88
# Source: http://noc.wikimedia.org/~tstarling/ifexist.log
import codecs
import time
import urllib
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

LOG_URL = 'http://noc.wikimedia.org/~tstarling/ifexist.log'
SAVE_PATH = 'ifexists.log.1'


def fetch_log(url=LOG_URL, path=SAVE_PATH):
    """Download *url* and append its raw bytes to the file at *path*.

    The payload is written in binary mode, byte for byte.  Pushing the
    undecoded response through a UTF-8 text writer (as the script used to
    do) forces an implicit ASCII decode on Python 2 and fails on the first
    non-ASCII byte.

    The whole response is read before the output file is opened, so a
    failed download never leaves a partial chunk appended.

    Args:
        url: Location of the log to fetch.
        path: Local file to append to; created if it does not exist.

    Returns:
        The number of bytes appended.
    """
    # closing() gives a context manager on Python 2 as well, where the
    # object returned by urlopen() does not support ``with`` directly.
    with closing(urlopen(url)) as response:
        data = response.read()
    with open(path, 'ab') as save_file:
        save_file.write(data)
    return len(data)


if __name__ == '__main__':
    fetch_log()
# ifexistslog-sort-zh.py
# Read the saved log ifexists.log.1, extract the Chinese Wikipedia URLs from
# lines like:
#   2007-12-03 06:27:16 zhwiki: 131 http://zh.wikipedia.org/wiki/%E6%B4%9B%E7%A3%AF%E5%B1%B1%E8%84%88
# and append the de-duplicated, sorted short URLs to ifexists.sort.zh.1.
# Log source: http://noc.wikimedia.org/~tstarling/ifexist.log
import codecs
import io
import re

try:  # Python 2
    _prompt = raw_input
except NameError:  # Python 3
    _prompt = input

## REGEX
# A zh.wikipedia.org URL, up to the next whitespace.  The dot after "zh" is
# escaped so that zh-yue / zh-classical / zh-min-nan hosts are not picked up,
# and \S+ (rather than ".+\b") keeps a trailing ")" or other non-word
# character that belongs to the page title.
urlX = re.compile(r'http://zh\.\S+', flags=re.U)
# Query-string tail such as "&variant=zh-tw", removed entirely.
andX = re.compile(r'&.*', flags=re.U)
# Long-form path, rewritten to the short "wiki/" form.
shorturlX = re.compile(r'w/index\.php\?title=', flags=re.U)


def normalize_url(url):
    """Return *url* without its "&..." tail and in short "wiki/Title" form."""
    url = andX.sub('', url)
    return shorturlX.sub('wiki/', url)


def collect_short_urls(text):
    """Return the sorted, de-duplicated short URLs found in log *text*.

    Normalising before de-duplicating lets the long and short forms of the
    same page collapse into a single entry.
    """
    return sorted({normalize_url(url) for url in urlX.findall(text)})


def sort_zh_urls(log_path='ifexists.log.1',
                 data_path='ifexists.sort.zh.1',
                 interactive=True):
    """Extract the URLs from *log_path* and append them to *data_path*.

    The log is opened read-only.  The previous 'a+' mode silently created
    a missing log and, wherever the initial read position is end-of-file,
    read nothing at all.  A missing log now raises instead of producing an
    empty result.

    Args:
        log_path: UTF-8 log file written by ifexistslog.py.
        data_path: Output file; one URL per line is appended to it.
        interactive: When true, pause before writing and after each URL
            until the user types something other than a bare Return.

    Returns:
        The sorted list of URLs that were written.
    """
    print('ifexistslog-sort-zh.py')
    with io.open(log_path, 'r', encoding='utf-8') as log_file:
        urls = collect_short_urls(log_file.read())

    # An empty answer keeps the per-URL pause going; anything else turns
    # the rest of the run non-interactive.
    answer = _prompt('haha\n\n\n\n\n\a') if interactive else 'auto'

    # newline='\n' keeps the output byte-identical on every platform.
    with io.open(data_path, 'a', encoding='utf-8', newline='\n') as data_file:
        for url in urls:
            print(url)
            if answer == '':
                answer = _prompt('press return to continue, something else to automate')
            data_file.write(url + u'\n')

    print('%d urls in total.' % len(urls))
    return urls


if __name__ == '__main__':
    sort_zh_urls()