Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse all talkpages #71

Merged
merged 2 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ This version of the scripts uses python3, which unfortunately broke our old wiki
- `missing_categories.py`: Searches for non-translated categories. Categories which are only in english should generally be marked as {{non-article category}}.
- `missing_translations.py`: Generates the list of missing translations for each language compared to english, which is used by the translator's noticeboard
- `untranslated_templates.py`: Parses templates for {{lang}} usage, and reports whether or not they are fully translated.
- `active_discussions.py`: Searches for recent edits to talk namespaces, and reports on activity based on the number of editors.

## Weekly reports
- `displaytitles_weekly.py`: Weekly copy of the monthly report which only runs on the past week of recent changes.
Expand Down
73 changes: 73 additions & 0 deletions active_discussions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from datetime import datetime, timedelta
from utils import pagescraper_queue, time_and_date
from wikitools import wiki

# Progress-printing flag; flipped to True when the script is run directly.
verbose = False
# Activity windows, computed once at import time. Naive UTC datetimes, which
# matches the naive result of strptime() on MediaWiki API timestamps.
# NOTE(review): datetime.utcnow() is deprecated in 3.12+; migrating to
# datetime.now(UTC) would also require making the parsed revision timestamps
# timezone-aware, or comparisons will raise — confirm before changing.
one_month_ago = datetime.utcnow() - timedelta(days=30)
one_week_ago = datetime.utcnow() - timedelta(days=7)
KNOWN_BOTS = ['WelcomeBOT'] # We only need to list bots which post to talkpages.

def pagescraper(page, active_one_week, active_one_month):
    """Classify a talk page's recent activity and file it into a bucket.

    Fetches the page's revisions from the past month, counts distinct non-bot
    editors, and appends the page to `active_one_week` (any editor in the last
    7 days) or else to `active_one_month` (3+ distinct editors in the last 30
    days). Pages matching neither criterion are dropped.
    """
    if verbose:
        print(f'Fetching revisions for {page}')

    editors_past_week = set()
    editors_past_month = set()
    for rev in page.get_revisions(one_month_ago):
        author = rev['user']
        if author in KNOWN_BOTS:
            continue  # Skip automated posters (e.g. welcome messages).
        when = rev['timestamp']
        if when > one_week_ago:
            editors_past_week.add(author)
        if when > one_month_ago:
            # Weekly editors are monthly editors too; the month set is a superset.
            editors_past_month.add(author)

    # A discussion is 'active' if anyone edited this week, or 3+ distinct
    # editors did within the month. Weekly takes precedence over monthly.
    if editors_past_week:
        active_one_week.append(page)
    elif len(editors_past_month) >= 3:
        active_one_month.append(page)

def main(w):
    """Build the wikitext report of recently-active talk pages on wiki `w`.

    Scans recent changes in every talk namespace over the past month, scrapes
    each touched page's revision history concurrently, and returns the report
    body (DISPLAYTITLE header plus weekly and monthly sections).
    """
    # Every namespace whose name contains 'talk' (Talk:, User talk:, ...).
    talk_namespaces = [ns for ns in w.namespaces if 'talk' in ns.lower()]

    # A set de-duplicates pages edited more than once in the window.
    recent_pages = set(w.get_recent_changes(one_month_ago, namespaces=talk_namespaces))
    if verbose:
        print(f'Found {len(recent_pages)} recently modified talkpages in the past month')

    active_one_week = []
    active_one_month = []
    # The scraper workers append into the two bucket lists as pages complete.
    with pagescraper_queue(pagescraper, active_one_week, active_one_month) as pages:
        for page in recent_pages:
            pages.put(page)
    if verbose:
        print(f'Found {len(active_one_week)} active discussions this week')
        print(f'Found {len(active_one_month)} active discussions this month')

    total_active = len(active_one_week) + len(active_one_month)
    # Doubled braces survive .format() to produce the literal {{...}} wikitext.
    output = """\
{{{{DISPLAYTITLE: {count} active discussions}}}}
There are '''<onlyinclude>{count}</onlyinclude>''' active discussions as of {date}.

""".format(
        count=total_active,
        date=time_and_date())

    for period, bucket in (('week', active_one_week), ('month', active_one_month)):
        output += f'== Active talk pages in the past {period} ==\n'
        for page in sorted(bucket):
            output += f'* [[{page}]]\n'

    return output

# Standalone entry point: generate the report against the live TF2 wiki and
# dump the wikitext to disk (the master.py harness publishes it instead).
if __name__ == '__main__':
    verbose = True  # Module-level flag read by pagescraper() and main().
    w = wiki.Wiki('https://wiki.teamfortress.com/w/api.php')
    # NOTE(review): the output filename 'wiki_all_articles.txt' looks
    # copy-pasted from another report — confirm it is intentional.
    with open('wiki_all_articles.txt', 'w') as f:
        f.write(main(w))
    # f.name remains readable after the with-block; only the handle is closed.
    print(f'Article written to {f.name}')
6 changes: 2 additions & 4 deletions master.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,15 @@
# {{lang}} template mis-ordering and lang-template duplicate keys
# Templates sorted by usage and protect status
# A 'missing translations' report but for dictionary entries (maybe sorted by usage, too?)
# A report for "Edits on talkpages (not in the "user talk" namespace) in the past few days", so people can track active discussions?
# Templates which have redirects in them

# Reports I want to improve:
# update readme (again)
# Consider running some scripts against the Help: namespace, too
# (like what? miscategorized, mismatched, uhhh)
# Sort missing categories by # pages
# Sort the output from mismatched
# Sort the output from displaytitles
# Threading for navboxes.py?
# Ensure that PRs which add files also touch readme.md -> isn't this done?
# Templates which link to redirects

def edit_or_save(page_name, file_name, output, summary):
wiki_diff_url = Page(w, page_name).edit(output, bot=True, summary=summary)
Expand Down Expand Up @@ -68,6 +65,7 @@ def publish_report(w, module, report_name, root, summary):
'missing_categories': 'Untranslated categories',
'missing_translations': 'Missing translations',
'untranslated_templates': 'Untranslated templates',
'active_discussions': 'Active discussions',
}

# English-only but otherwise frequently changing reports
Expand Down
15 changes: 15 additions & 0 deletions wikitools/page.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import datetime
from time import sleep
import functools
import requests
Expand Down Expand Up @@ -107,6 +108,20 @@ def get_file_link_count(self):
# Also, this report uses page IDs for iteration, so for now we're returning solely based on the first page of results.
return html.count('mw-whatlinkshere-tools') # Class for (<-- links | edit)

def get_revisions(self, starttime, rvprop='user|timestamp'):
    """Yield this page's revisions, newest first, until one predates `starttime`.

    Pages through the MediaWiki revisions API and converts each revision's
    'timestamp' field from the API string format ('%Y-%m-%dT%H:%M:%SZ') into
    a naive datetime before yielding. Stops (without yielding) at the first
    revision older than `starttime`; since results arrive newest-to-oldest,
    every later revision would be older still.
    """
    batches = self.wiki.get_with_continue('query', 'pages',
        prop='revisions',
        titles=[self.url_title],
        rvprop=rvprop,
        rvlimit=500,
        rvdir='older', # API default: list from newest to oldest.
    )
    for batch in batches:
        for revision in batch['revisions']:
            parsed = datetime.strptime(revision['timestamp'], '%Y-%m-%dT%H:%M:%SZ')
            revision['timestamp'] = parsed
            if parsed < starttime:
                return  # Everything beyond this point is too old; stop paging.
            yield revision

def edit(self, text, summary, bot=True):
if len(text) > 3000 * 1000: # 3 KB
text = '<span class="error">Warning: Report truncated to 3 KB</span>\n' + text[:3000 * 1000]
Expand Down
Loading