Código para llevar: Listas de palabras de Wiktionary

Como experimento y para probar la nueva versión de Pywc, nació (más bien se escribió) un script que usa Wiktionary (el diccionario de la Wikipedia) para extraer listas de palabras.

La sintaxis es:

./wikiwords.py [-help] [-min <tamaño>] [-max <tamaño>] [-lang <lenguaje>] [-numpags <número>] [-inicial <palabra>] [-nullwd <lista>]
-help: muestra esto
-min: las palabras de salida serán como mínimo de <tamaño> letras
-max: las palabras de salida serán como máximo de <tamaño> letras
-lang: se usará el wiki en <lenguaje>
-numpags: se leerán <número> páginas
-inicial: la palabra inicial será <palabra>
-nullwd: la lista de palabras "inútiles" (artículos) está en <lista> o no existe (off)

La lista de palabras "inútiles" de ejemplo es:

cual
cuales
de
el
la
los
las
o
y
que
se
en
un
una

El script se puede descargar aquí [wikiwords.zip] (con la lista de palabras) y con Pywc [wikiwords_03.zip].
Y el código coloreado con pygments...

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Escrito por kenkeiras [CodigoParaLlevar.blogspot.com]

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by

    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
"""

import pywc, string, sys

# Number of command-line arguments, counting the script name itself.
argc = len(sys.argv)

def show_help():
    """Print the command-line usage summary to standard output.

    Every message is a single parenthesised string, so each statement
    behaves the same under the Python 2 print statement and the Python 3
    print function.
    """
    # The usage line spells "-inicial" the way the option parser matches it,
    # and the accented characters are real UTF-8 text instead of mojibake.
    print("./wikiwords.py [-help] [-min <tamaño>] [-max <tamaño>] "
          "[-lang <lenguaje>] [-numpags <número>] [-inicial <palabra>] "
          "[-nullwd <lista>]")

    print("-help: muestra esto")
    print("-min: las palabras de salida serán como mínimo de <tamaño> letras")
    print("-max: las palabras de salida serán como máximo de <tamaño> letras")

    print("-lang: se usara el wiki en <lenguaje>")
    print("-numpags: se leeran <número> páginas")
    print("-inicial: la palabra inicial será <palabra>")

    print("-nullwd: la lista de palabras \"inútiles\" (artículos) está en "
          "<lista> o no existe (off)")

# Characters that may appear in a word: the Spanish alphabet in both cases,
# including n-tilde, the accented vowels and u-diaeresis.
textchars = (u"abcdefghijklmnñopqrstuvwxyz"
             u"ABCDEFGHIJKLMNÑOPQRSTUVWXYZ"
             u"áéíóúüÁÉÍÓÚÜ")


def isClean(s):
    """Return True when every character of *s* is listed in ``textchars``.

    Byte strings are decoded as UTF-8 before the check, so an accented word
    is judged by its letters rather than rejected by the ASCII default codec.
    Bytes that are not valid UTF-8, and values that are not strings at all,
    are reported as not clean. An empty string is clean, since it contains
    no forbidden character.
    """
    if isinstance(s, bytes):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
            return False

    try:
        return all(c in textchars for c in s)
    except TypeError:
        # s is not iterable, or does not yield characters.
        return False

# Default values, used for every option not given on the command line.
minchars = 0          # minimum word length
maxchars = 100        # maximum word length

numpags = 100         # number of pages to visit
inicial = "meta"      # starting word
lang = "es"           # wiki language code

nullwords = "nullwd"  # path of the useless-word list ("off" disables it)

# Read the command-line options. Each option except -help consumes the
# argument that follows it. Arguments that are not options are skipped.
i = 1

while i < len(sys.argv):
    option = sys.argv[i]

    if option == "-help":
        show_help()
        sys.exit(0)

    if option in ("-min", "-max", "-lang", "-numpags", "-inicial", "-nullwd"):
        # An option written last has no value to consume: report it instead
        # of failing with an IndexError traceback.
        if i + 1 >= len(sys.argv):
            sys.stderr.write("Error: falta el valor para %s\n" % option)
            sys.exit(1)

        i += 1
        value = sys.argv[i]

        if option in ("-min", "-max", "-numpags"):
            # The numeric options must parse as integers.
            try:
                number = int(value)
            except ValueError:
                sys.stderr.write("Error: %s espera un entero: %s\n"
                                 % (option, value))
                sys.exit(1)

            if option == "-min":
                minchars = number
            elif option == "-max":
                maxchars = number
            else:
                numpags = number

        elif option == "-lang":
            lang = value

        elif option == "-inicial":
            inicial = value

        else:  # -nullwd
            nullwords = value

    i += 1

# Characters that split words
splitchars = ",.-/+|:!?\"'\\&\n"

# URLs still to be visited, starting with the entry for the initial word
new = ["http://" + lang + ".wiktionary.org/wiki/" + inicial]

# Words that have already come up and must not be reported again
wordlist = ['wikitionary']

# Add the null (useless) words to the list so they are never reported.
# Passing "off" in any letter case skips the file altogether.
if nullwords.lower() != "off":

    try:
        # The with-statement closes the file even when reading fails midway.
        with open(nullwords, "r") as f:
            for line in f:
                line = line.replace("\r", "").strip()

                if line and line not in wordlist:
                    wordlist.append(line)

    except (IOError, OSError) as ex:
        # A missing or unreadable list is reported but is not fatal: the
        # run goes on with whatever was loaded so far.
        sys.stderr.write("Error al leer archivo de palabras nulas: %s\n" % ex)

crawler = pywc.crawler()  # start the crawler

# Set mirror of wordlist, so each "already seen?" test is a hash lookup
# instead of a scan over every word collected so far.
seen = set(wordlist)

# Visit queued pages until the queue is empty or the page budget runs out.
while new and numpags > 0:

    site = crawler.crawlsite(new.pop(0))  # read the next page

    inPost = False
    numpags -= 1

    for el in site.element_list:  # for every element of the page

        if el.tag_type == "div":
            try:
                if ("id" in el.property) and \
                        (el.property['id'] == "bodyContent"):
                    inPost = True   # the article starts here

                elif el.property['class'] == "printfooter":
                    break           # the article ends here

            except Exception:
                # Best effort: a div with neither marker (for instance one
                # with no "class" property) is ignored. Unlike a bare
                # except, this lets Ctrl-C and sys.exit through.
                pass

        # Only elements inside the article whose tag attribute is false are
        # processed -- presumably these are the text nodes; confirm in pywc.
        if not (inPost and not el.tag):
            continue

        el.text = el.text.lower()
        tmp = el.text.strip()

        # Turn every separator into a space, then cut on spaces.
        for schar in splitchars:
            tmp = tmp.replace(schar, " ")

        for word in tmp.split(" "):

            # Keep only plain words within the length limits that have not
            # come up before.
            if not isClean(word):
                continue

            lonx = len(word)
            if lonx == 0 or lonx < minchars or lonx > maxchars:
                continue

            if word in seen:
                continue

            seen.add(word)
            wordlist.append(word)

            print(word)

            # NOTE(review): the first URL is built on wiktionary.org but the
            # follow-up pages are requested from wikipedia.org -- confirm
            # whether this switch is intentional.
            new.append("http://" + lang + ".wikipedia.org/wiki/" + word)

Código para llevar

jueves, 8 de julio de 2010

Listas de palabras de Wiktionary

No hay comentarios:

Publicar un comentario