User:QiuLiming-bot/stubremovecode
外观
Wikipedia:机器人/作业请求 Wikipedia:机器人/申请
QiuLiming-bot允許任何人不經同意就編輯此頁面。
from bs4 import BeautifulSoup
from urllib.parse import unquote
import requests
import time
import string
import grapheme
import re
from wpinfo import * # 账号数据
# Run configuration and start-up prompt.
# NOTE(review): the indentation of this file was lost; the four file-clearing
# statements are assumed to all sit under `if clear_file:` -- confirm.
clear_file = False  # when True, log.txt and chlog.txt are emptied at start-up
debug = True  # consulted by the main loop further down (`if not debug:`)
start_page = int(input('Start from page(will clear file): ' + str(clear_file)))
print(start_page)
if clear_file:
    for log_name in ('log.txt', 'chlog.txt'):
        # Opening in 'w' mode creates the file if needed and truncates it to
        # zero length; the context manager closes the handle right away.
        with open(log_name, 'w'):
            pass
def is_chinese(char):
    """Return True when *char* lies between U+4E00 and U+9FA5, inclusive.

    The check is a plain string comparison, so a one-character string is
    compared by its code point.  The range was taken from a segmentfault.com
    snippet.
    """
    first_ideograph, last_ideograph = '\u4e00', '\u9fa5'
    if char < first_ideograph:
        return False
    return char <= last_ideograph
def getLen(s):
    """Return the grapheme count of the characters of *s* accepted by is_chinese.

    Every other character (Latin letters, digits, punctuation, whitespace, ...)
    is dropped before counting.
    """
    chinese_only = ''.join(filter(is_chinese, s))
    return grapheme.length(chinese_only)
def getTotalCount(page_content, limit = 200*1.25):
    """Count the Chinese characters in the body paragraphs of a rendered page.

    Only ``<p>`` elements that are direct children of
    ``#mw-content-text > .mw-parser-output`` are considered.  Within each
    paragraph every ``sup.reference`` and ``span.noprint`` element is removed
    before the text is measured with ``getLen``.

    Args:
        page_content: HTML of the page (whatever ``BeautifulSoup`` accepts).
        limit: Not read by this function; kept so existing calls that pass it
            keep working.

    Returns:
        The summed ``getLen`` of all selected paragraphs.
    """
    b = BeautifulSoup(page_content, 'lxml')
    count = 0
    for p in b.select('#mw-content-text > .mw-parser-output > p'):
        for selector in ('sup.reference', 'span.noprint'):
            # Query again after every removal.  Indexing a fresh select()
            # result by a running counter skips every other match (the list
            # shrinks as nodes are removed), and re-querying also copes with
            # matches nested inside an already removed element.
            node = p.select_one(selector)
            while node is not None:
                node.decompose()
                node = p.select_one(selector)
        count += getLen(p.text)
    return count
# Matches ONE stub-marker template written on a single line without
# parameters, plus the newline directly in front of it:
#   {{stub}} / {{Stub}}, {{<anything>-stub}}, {{<anything> stub}},
#   {{<anything>小作品}}.
# `[^{}|\n]` keeps a match inside a single template, so a neighbouring
# template on the same line can never be swallowed.
STUB_TEMPLATE_RE = re.compile(
    r'\n?\{\{(?:[Ss]tub|[^{}|\n]*[ -]stub|[^{}|\n]*小作品)\}\}'
)


def strip_stub_templates(wikitext):
    """Return *wikitext* with every stub-marker template removed.

    Templates that are not stub markers (infoboxes, citations, ...) are left
    untouched.
    """
    return STUB_TEMPLATE_RE.sub('', wikitext)


if __name__ == '__main__':
    # Local import: only the script entry point needs it.
    from urllib.parse import quote

    # A page whose body holds more Chinese characters than this loses its
    # stub template.
    STUB_CHAR_LIMIT = 200 * 1.25

    S = requests.session()
    url = "https://zh.wikipedia.org/w/api.php"

    def resturl(x):
        """Return the REST API URL of page *x*, fully percent-encoded."""
        # safe='' also encodes '/', which would otherwise be read as a path
        # separator by the REST router.
        return 'https://zh.wikipedia.org/w/rest.php/v1/page/' + quote(x, safe='')

    # --- Log in -----------------------------------------------------------
    loginTokenP = {"action": "query", "meta": "tokens", "type": "login", "format": "json"}
    loginToken = S.get(url, params=loginTokenP).json()['query']['tokens']['logintoken']
    loginP = {"action": "login", "lgname": bot1_name, "lgpassword": bot1_pass,
              "lgtoken": loginToken, "format": "json"}
    r1 = S.post(url, data=loginP)
    login_result = r1.json().get('login', {}).get('result')
    if login_result != 'Success':
        # Going on would try to edit while logged out.
        raise SystemExit('Login failed: {}'.format(login_result))

    # The key must be the string "type", not the builtin `type`.
    tokenP = {"action": "query", "meta": "tokens", "format": "json", "type": "csrf"}
    token = S.get(url, params=tokenP).json()['query']['tokens']['csrftoken']

    # --- Walk the stub category batch by batch ----------------------------
    stubCatName = 'Category:全部小作品'
    cmcontinue = ''
    curpage = 0  # index of the category batch currently being handled
    while True:
        stubCatP = {'action': 'query', 'list': 'categorymembers', 'cmtitle': stubCatName,
                    'cmlimit': 'max', 'cmnamespace': '0', 'format': 'json'}
        if cmcontinue:
            # Only send a continuation token once the API has handed one out.
            stubCatP['cmcontinue'] = cmcontinue
        cmresp = S.get(url, params=stubCatP).json()
        cmlist = cmresp['query']['categorymembers']

        next_continue = cmresp.get('continue', {}).get('cmcontinue')
        Exit = next_continue is None  # no token -> this was the last batch
        if not Exit:
            cmcontinue = next_continue

        if start_page > curpage:
            # Still skipping ahead to the batch the operator asked for.
            # open('pagemax.txt', 'w').write('SP:' + str(start_page))
            curpage += 1
            if Exit:
                # start_page lies beyond the last batch; without this the
                # last batch would be fetched over and over.
                raise SystemExit
            continue

        for i in cmlist:
            title = i['title']
            # quote() keeps characters such as '?', '#' and '%' inside the
            # title from being read as URL syntax.
            content = requests.get('https://zh.wikipedia.org/wiki/' + quote(title)).content
            total = getTotalCount(content)
            with open('log.txt', 'a', encoding='utf-8') as log_file:
                log_file.write('{} : {}字\n'.format(title, total))

            if total <= STUB_CHAR_LIMIT:
                continue

            # NOTE(review): the indentation of this file was lost.  The
            # nesting below assumes `else:` belonged to `if not debug:`
            # (debug run: record the candidate; real run: edit) -- confirm.
            if debug:
                with open('chlog.txt', 'a', encoding='utf-8') as change_log:
                    change_log.write("| {} || {}\n|-\n".format(title, total))
                continue

            time.sleep(10)
            wikisource = requests.get(resturl(title)).json().get('source')
            if wikisource is None:
                # No wikitext in the response; nothing to edit.
                continue
            text = strip_stub_templates(wikisource)
            if text == wikisource:
                # No stub template found in the source; skip the null edit.
                continue
            editP = {'action': 'edit', 'title': title, 'token': token,
                     'format': 'json', 'text': text, 'bot': 'y'}
            R = S.post(url, data=editP)
            edit_response = R.json()
            if edit_response.get('edit', {}).get('result') == 'Success':
                print('已经编辑', title)
            else:
                print('Edit failed:', title, edit_response)
            time.sleep(10)

        curpage += 1
        # Remember how far the run got so a later run can resume from here.
        with open('pagemax.txt', 'w') as progress_file:
            progress_file.write(str(curpage))
        if Exit:
            raise SystemExit