Wikiproyecto:Bots/Repositorio/ranking-visitas.py

File information
  • File name: ranking-visitas.py
  • Language: Python
  • Status: unprotected
Edit details
  • Details:
Script by BOTijo (talk · contribs · blocks) to update Wikipedia:Ranking de visitas. Originally tarea018.py.
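Usage: the optional first argument overrides the default limit of 100 ranking entries, e.g. (assuming a configured pywikipedia environment):

  python ranking-visitas.py 250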
# -*- coding: utf-8 -*-

# Copyright (C) 2009 emijrp
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import wikipedia, gzip, os, re, datetime, sys
import urllib
import time
import pagegenerators

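# Local helper module with shared task utilities (provides getNamespacesList).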
import tareas

limite=100
if len(sys.argv)>1:
    limite=int(sys.argv[1])

langs=['es']
exitpage=u'User:BOTijo/Sandbox' # output page; the live target is Wikipedia:Ranking de visitas

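# Fetch the directory listing of the pagecount dumps into a temporary file.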
index='/home/emijrp/temporal/tmpweb.html'
os.system('wget http://dammit.lt/wikistats/ -O %s' % index)
f=open(index, 'r')
wget=f.read()
f.close()
# Yesterday's date, zero-padded as it appears in the dump filenames (YYYYMMDD).
ayer=datetime.date.today()-datetime.timedelta(days=1)
ayerano=str(ayer.year)
ayermes='%02d' % ayer.month
ayerdia='%02d' % ayer.day
#m=re.compile(ur'(?i)\"(pagecounts\-%s%s%s\-\d{6}\.gz)\"' % (ayerano, ayermes, ayerdia)).finditer(wget)
m=re.compile(ur'(?i)\"(pagecounts\-20081201\-\d{6}\.gz)\"').finditer(wget) # NOTE: the date is hardcoded to 2008-12-01; the commented line above selects yesterday's dumps instead
gzs=[]
for i in m:
    print i.group(1)
    gzs.append(i.group(1))
wikipedia.output("Elegidos %d ficheros..." % len(gzs))

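# For each language, fetch its namespace names and build a regex matching
# titles outside the main namespace (meant as an exclusion filter; its
# application in the parsing loop below is commented out).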
pagesdic={}
namespaceslists={}
exceptions={}
for lang in langs:
    namespaceslists[lang]=tareas.getNamespacesList(wikipedia.Site(lang, 'wikipedia'))
    exceptions[lang]={}
    exceptions[lang]['raw']='|'.join(namespaceslists[lang])
    exceptions[lang]['compiled']=re.compile(r'(?i)(%s)\:' % exceptions[lang]['raw'])

wikipedia.output("Se van a analizar los idiomas: %s" % ', '.join(langs))
for lang in langs:
    wikipedia.output("Excepciones de %s: %s" % (lang, exceptions[lang]['raw']))

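# Walk each hourly pagecounts dump: decompress it, parse its lines, and
# accumulate per-page visit counts into pagesdic[lang][title].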
for gz in gzs:
    print gz
    try:
        f=gzip.open('/mnt/user-store/stats/%s' % gz, 'r')
    except IOError: # dump not cached locally yet; download it first
        os.system('wget http://dammit.lt/wikistats/%s -O /mnt/user-store/stats/%s' % (gz, gz))
        f=gzip.open('/mnt/user-store/stats/%s' % gz, 'r')
   
    #regex=re.compile(ur'(?im)^([a-z]{2}) (.*?) (\d{1,}) (\d{1,})$') # avoid project codes like aa.b
    regex=re.compile(r'(?im)^(?P<pagelang>%s) (?P<page>.+) (?P<times>\d{1,}) (?P<other>\d{1,})$' % '|'.join(langs)) # restrict to the configured languages, avoiding project codes like aa.b
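    # Each dump line has the form "<project> <title> <requests> <bytes>".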
   
    c=analized=errores=0
    for line in f:
        line=line[:-1] # strip the trailing newline
        try:
            line=line.encode('utf-8')
            line=urllib.unquote(line) # decode percent-encoded titles
        except:
            try:
                line=urllib.unquote(line)
            except:
                wikipedia.output(line) # undecodable line; log it and skip
                errores+=1
                continue
        c+=1
        if c % 250000 == 0:
            print "Leidas %d lineas (%d analizadas, %d errores)" % (c, analized, errores)
            print "%d idiomas" % len(pagesdic)
            cc=0
            for proj, projpages in pagesdic.items():
                cc+=1
                if cc<=10:
                    print "  %d) %s.wikipedia.org" % (cc, proj)
                else:
                    print "    Y algunos mas..."
                    break
       
        m=regex.finditer(line)
        for i in m:
            pagelang=i.group('pagelang')
            page=re.sub('_', ' ', i.group('page'))
           
            #if re.search(exceptions[pagelang]['compiled'], page):
            #    continue
           
            times=int(i.group('times'))
            other=int(i.group('other'))
           
            #lang
            if pagelang not in pagesdic:
                pagesdic[pagelang]={}
           
            #page
            if page in pagesdic[pagelang]:
                pagesdic[pagelang][page]+=times
            else:
                pagesdic[pagelang][page]=times
                analized+=1
    f.close()
    break # only the first dump file is processed; remove this break to aggregate the whole day

# sort each language's pages from most visited to least
pageslist={}
cc=0
for lang, pages in pagesdic.items():
    cc+=1
    print "Ordenando %s.wikipedia.org [%d/%d]" % (lang, cc, len(pagesdic))
    pageslist[lang]=[(page, visits) for visits, page in sorted([(v, p) for p, v in pages.items()], reverse=True)]

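# Sum the visits of every page per language; the totals feed the
# {{/begin}} header template.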
totalvisits={}
for lang, pages in pageslist.items():
    totalvisits[lang]=sum(visits for page, visits in pages)

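# Select candidate pages per language, dropping Special: pages and
# sort-icon artifacts left over from the raw logs.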
pageselection={}
for lang, pages in pageslist.items():
    c=0
    pageselection[lang]=[]
    for page, visits in pages:
        if re.search(ur'(?im)(Special\:|sort_down\.gif|sort_up\.gif|sort_none\.gif|\&limit\=)', page): # extend with other languages' prefixes
            continue
       
        c+=1
        if c<=limite*2: # safety margin: some pages may not exist, though that would be rare
            pageselection[lang].append([urllib.quote(page), visits])
        else:
            break

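# Build a wikitable row per page for each language and publish the result.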
for lang, selection in pageselection.items():
    projsite=wikipedia.Site(lang, 'wikipedia')
    salida="{{/begin|%d|%s|%d}}" % (limite, lang, totalvisits[lang])
   
    titles=[quotedpage for quotedpage, visits in selection]
    gen=pagegenerators.PagesFromTitlesGenerator(titles, projsite)
    pre=pagegenerators.PreloadingGenerator(gen, pageNumber=limite, lookahead=10) # preload page texts in batches
    c=0
    ind=-1
    for page in pre:
        detalles=u''
        ind+=1
        if page.exists():
            wtitle=page.title()
           
            if page.isRedirectPage():
                detalles+=u'(#REDIRECT [[%s]]) ' % (page.getRedirectTarget().title())
            elif page.isDisambig():
                detalles+=u'(Desambiguación) '
            else:
                pass
                """tmpget=page.get()
                if re.search(ur'(?i)\{\{ *Artículo bueno', tmpget):
                    detalles+='[[Image:Artículo bueno.svg|14px|Artículo bueno]]'
                if re.search(ur'(?i)\{\{ *(Artículo destacado|Zvezdica)', tmpget):
                    detalles+='[[Image:Cscr-featured.svg|14px|Featured article]]'
                if re.search(ur'(?i)\{\{ *(Semiprotegida2?|Semiprotegido|Pp-semi-template)', tmpget):
                    detalles+='[[Image:Padlock-silver-medium.svg|20px|Semiprotegida]]'
                if re.search(ur'(?i)\{\{ *(Protegida|Protegido|Pp-template)', tmpget):
                    detalles+='[[Image:Padlock.svg|20px|Protegida]]'"""
           
            wikipedia.output('%s - %d - %s' % (wtitle, selection[ind][1], detalles))
            #continue
           
            if page.namespace() in [6, 14]: # File: and Category: pages need a leading colon to be linked rather than embedded
                wtitle=u':%s' % wtitle
            c+=1
            salida+="\n|-\n| %d || [[%s]] %s|| %s " % (c, wtitle, detalles, selection[ind][1])
            if c>=limite:
                break
            #except:
            #    wikipedia.output(u'Error al generar item en lista de %s:' % lang)
   
    iws=u''
    for iw in langs:
        if iw!=lang:
            iws+=u'[[%s:%s]]\n' % (iw, exitpage)
    salida+="\n{{/end}}\n%s" % (iws)
    wikipedia.output(salida)
    wiii=wikipedia.Page(projsite, exitpage)
    wiii.put(salida, u'BOT - Updating list')