跳转到内容

用户:QiuLiming-bot/stubremovecode

维基百科,自由的百科全书

Wikipedia:机器人/作业请求 Wikipedia:机器人/申请

from bs4 import BeautifulSoup
from urllib.parse import unquote
import requests
import time
import string
import grapheme
import re
from wpinfo import * # 账号数据
# Run-time switches for the script.
clear_file = False  # when True, log.txt and chlog.txt are emptied before the run
debug = True  # when True, no edit is sent; candidates are only written to chlog.txt
# Index of the first category-listing batch to process; earlier batches are skipped.
start_page = int(input('Start from page(will clear file): ' + str(clear_file)))
print(start_page)
if clear_file:
    # Opening in 'w' mode already truncates (or creates) the file, so no
    # separate truncate pass is needed; the context manager closes the handle.
    for log_name in ('log.txt', 'chlog.txt'):
        with open(log_name, 'w'):
            pass

def is_chinese(char):
    """Tell whether ``char`` lies in the range U+4E00..U+9FA5 (inclusive).

    The check is a plain string comparison against the two bounds.
    Range check taken from segmentfault.com.
    """
    lower_bound, upper_bound = '\u4e00', '\u9fa5'
    return not (char < lower_bound or char > upper_bound)
def getLen(s):
    """Return the grapheme length of ``s`` after dropping non-Chinese characters.

    Every character of ``s`` is tested with ``is_chinese``; the ones that pass
    are joined back into a string whose length is measured by
    ``grapheme.length``.
    """
    chinese_only = ''.join(filter(is_chinese, s))
    return grapheme.length(chinese_only)
def getTotalCount(page_content, limit = 200*1.25):
    """Count the Chinese characters in the body paragraphs of a rendered page.

    ``page_content`` is parsed with the ``lxml`` parser and only ``<p>``
    elements that are direct children of
    ``#mw-content-text > .mw-parser-output`` are considered.  Before a
    paragraph is measured, every ``sup.reference`` and ``span.noprint``
    element inside it is removed so that its text is not counted.

    Args:
        page_content: HTML markup of the page, as accepted by BeautifulSoup.
        limit: Unused by this function; kept so that existing callers which
            pass it keep working.

    Returns:
        The sum of ``getLen(p.text)`` over all selected paragraphs.
    """
    soup = BeautifulSoup(page_content, 'lxml')
    count = 0
    for p in soup.select('#mw-content-text > .mw-parser-output > p'):
        for selector in ('sup.reference', 'span.noprint'):
            # Take the list of matches once and detach each node.  Re-running
            # select() after every removal and indexing into the shrinking
            # result skips every second match.
            for node in p.select(selector):
                node.extract()
        count += getLen(p.text)
    return count
# Matches one stub template that sits on a single line -- a name ending in
# " stub", "-stub" or "小作品", e.g. {{bio-stub}}, {{geo stub}}, {{日本小作品}} --
# together with the newline directly in front of it.  The character class keeps
# the match inside a single pair of braces so unrelated templates are untouched.
STUB_TEMPLATE_RE = re.compile(r'\n?\{\{[^{}\n]*?(?:[ -]stub|小作品)\}\}')


def removeStubTemplates(wikitext):
    """Return ``wikitext`` with every stub template removed.

    Args:
        wikitext: Wikitext source of a page.

    Returns:
        The source with each match of ``STUB_TEMPLATE_RE`` deleted; the input
        is returned unchanged when nothing matches.
    """
    return STUB_TEMPLATE_RE.sub('', wikitext)


if __name__ == '__main__':
    # Local import: the file-level import only brings in ``unquote``.
    from urllib.parse import quote

    S = requests.session()
    url = "https://zh.wikipedia.org/w/api.php"
    # Titles may contain '/', '?', '#', ... so they are fully percent-encoded
    # before being placed in the REST path.
    resturl = lambda x:  'https://zh.wikipedia.org/w/rest.php/v1/page/' + quote(x, safe='')

    # --- Log in (bot1_name / bot1_pass presumably come from ``wpinfo``). ---
    loginTokenP = {"action": "query", "meta": "tokens","type": "login", "format": "json"}
    loginToken = S.get(url, params=loginTokenP).json()['query']['tokens']['logintoken']
    loginP = {"action": "login", "lgname": bot1_name, "lgpassword": bot1_pass, "lgtoken": loginToken, "format": "json"}
    r1 = S.post(url, data=loginP)
    loginResult = r1.json().get('login', {}).get('result')
    if loginResult != 'Success':
        print('Login failed:', loginResult)
        if not debug:
            # Never send edits from a session that is not logged in.
            raise SystemExit(1)

    # The key must be the string 'type', not the builtin ``type`` object.
    tokenP = {"action": "query", "meta": "tokens", "format": "json", "type": 'csrf'}
    token = S.get(url, params=tokenP).json()['query']['tokens']['csrftoken']

    stubCatName = 'Category:全部小作品'
    cmcontinue = ''
    curpage = 0  # index of the category-listing batch currently being handled
    while True:
        stubCatP = {'action':'query', 'list': 'categorymembers', 'cmtitle': stubCatName, 'cmlimit': 'max', 'cmnamespace': '0', 'format': 'json'}
        if cmcontinue:
            # Only send the continuation token once the API has handed one out.
            stubCatP['cmcontinue'] = cmcontinue
        cmresp = S.get(url, params=stubCatP).json()
        cmlist = cmresp['query']['categorymembers']

        nextContinue = cmresp.get('continue', {}).get('cmcontinue')
        Exit = nextContinue is None  # no continuation token: this is the last batch
        if not Exit:
            cmcontinue = nextContinue

        if start_page > curpage:
            # Still skipping ahead to the requested start batch.
            if Exit:
                # The listing ended before start_page was reached; stop rather
                # than requesting the last batch over and over.
                break
            curpage += 1
            continue

        for i in cmlist:
            title = i['title']
            content = requests.get('https://zh.wikipedia.org/wiki/' + quote(title)).content
            total = getTotalCount(content)
            with open('log.txt', 'a', encoding='utf-8') as log:
                log.write('{} : {}\n'.format(title, total))

            if total <= 200 * 1.25:
                # At or below the threshold: leave the page alone.
                continue

            if debug:
                # Dry run: record the candidate as a wiki-table row only.
                with open('chlog.txt', 'a', encoding='utf-8') as chlog:
                    chlog.write("| {} || {}\n|-\n".format(title, total))
                continue

            time.sleep(10)
            wikisource = requests.get(resturl(title)).json()['source']
            text = removeStubTemplates(wikisource)
            if text == wikisource:
                # No stub template found in the source; nothing to save.
                continue
            editP = {'action':'edit', 'title':title, 'token': token, 'format':'json', 'text': text, 'bot': 'y'}
            S.post(url, data=editP)
            print('已经编辑', title)
            time.sleep(10)

        curpage += 1
        # Record how many batches are done so a later run can resume from here.
        with open('pagemax.txt', 'w') as progress:
            progress.write(str(curpage))
        if Exit:
            break