{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pyparsing\n", "\n", "* *90 min* | Última modificación: Diciembre 17, 2020" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pyparsing as pp" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Hello', ',', 'World', '!']\n", "['Bonjour', ',', 'Monde', '!']\n", "['Hola', ',', 'Mundo', '!']\n", "['Hallo', ',', 'Welt', '!']\n" ] } ], "source": [ "##\n", "## Hola mundo\n", "##\n", "\n", "##\n", "## Define la gramática como:\n", "## \n", "## greet -> string ',' string '!'\n", "##\n", "greet = pp.Word(pp.alphas) + \",\" + pp.Word(pp.alphas) + \"!\"\n", "\n", "\n", "## procesa saludos en varios idiomas:\n", "for greeting_str in [\n", " \"Hello, World!\",\n", " \"Bonjour, Monde!\",\n", " \"Hola, Mundo!\",\n", " \"Hallo, Welt!\",\n", " ]:\n", " greeting = greet.parseString(greeting_str)\n", " print(greeting)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['111.222.333.444', '(123)456-7890']\n", "['131.322.393.458', '(599)353-7800']\n" ] } ], "source": [ "##\n", "## Pasing de una dirección IP y un número telefonico\n", "## en formato US:\n", "##\n", "## 111.222.333.444(123)456-7890\n", "## 131.322.393.458(599)353-7800\n", "##\n", "\n", "ipField = pp.Word(pp.nums, max=3)\n", "ipAddr = pp.Combine(ipField + \".\" + ipField + \".\" + ipField + \".\" + ipField)\n", "phoneNum = pp.Combine(\n", " \"(\"\n", " + pp.Word(pp.nums, exact=3)\n", " + \")\"\n", " + pp.Word(pp.nums, exact=3)\n", " + \"-\"\n", " + pp.Word(pp.nums, exact=4)\n", ")\n", "userdata = ipAddr + phoneNum\n", "\n", "for text in [\n", " \"111.222.333.444(123)456-7890\",\n", " \"131.322.393.458(599)353-7800\",\n", "]:\n", " parsed_text = userdata.parseString(text)\n", " print(parsed_text)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['a', '=', '10']\n", "['a_2', '=', '100']\n", "['pi', '=', '3.14159']\n", "['goldenRatio', '=', '1.61803']\n", "['E', '=', 'mc2']\n" ] } ], "source": [ "## \n", "## Ejemplo de la definición de una gramática\n", "## para asignaciones en lenguajes de programación.\n", "##\n", "## Ejemplo:\n", "##\n", "## a = 10\n", "## a_2=100\n", "## pi=3.14159\n", "## goldenRatio = 1.61803\n", "## E = mc2\n", "##\n", "identifier = pp.Word(pp.alphas, pp.alphanums+'_')\n", "number = pp.Word(pp.nums+\".\")\n", "assignmentExpr = identifier + \"=\" + (identifier | number)\n", "\n", "for text in [\n", " \"a = 10\",\n", " \"a_2=100\",\n", " \"pi=3.14159\",\n", " \"goldenRatio = 1.61803\",\n", " \"E = mc2\",\n", "]:\n", " parsed_text = assignmentExpr.parseString(text)\n", " print(parsed_text)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.14159 is assigned to pi\n" ] } ], "source": [ "##\n", "## Asignación de nombres a partes de la expresión\n", "## usando setResultName()\n", "##\n", "identifier = pp.Word(pp.alphas, pp.alphanums + \"_\")\n", "number = pp.Word(pp.nums + \".\")\n", "\n", "## define los nombres de las partes\n", "assignmentExpr = (\n", " identifier.setResultsName(\"lhs\") + \"=\" + (identifier | number).setResultsName(\"rhs\")\n", ")\n", "\n", "## parser\n", "assignmentTokens = assignmentExpr.parseString(\"pi=3.14159\")\n", "\n", "## imprime las componentes usando los nombres asignados\n", "print(assignmentTokens.rhs, \"is assigned to\", assignmentTokens.lhs)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Hello', ',', 'World', '!']\n", "['Hi', ',', 'Mom', '!']\n", "['Good', 'morning', ',', 'Miss', 'Crabtree', '!']\n", "['Yo', ',', 'Adrian', '!']\n", "['Whattup', ',', 'G', '?']\n", "[\"How's\", 'it', \"goin'\", ',', 'Dude', '?']\n", "['Hey', ',', 'Jude', '!']\n", "['Goodbye', ',', 'Mr.', 'Chips', '!']\n" ] } ], "source": [ "##\n", "## Ejemplo de una gramática más compleja para\n", "## parsear los siguientes textos:\n", "##\n", "## Hello, World!\n", "## Hi, Mom!\n", "## Good morning, Miss Crabtree!\n", "## Yo, Adrian!\n", "## Whattup, G?\n", "## How's it goin', Dude?\n", "## Hey, Jude!\n", "## Goodbye, Mr. Chips!\n", "##\n", "word = pp.Word(pp.alphas+\"'.\")\n", "salutation = pp.OneOrMore(word)\n", "comma = pp.Literal(\",\")\n", "greetee = pp.OneOrMore(word)\n", "endpunc = pp.oneOf(\"! ?\")\n", "greeting = salutation + comma + greetee + endpunc\n", "\n", "for text in [\n", " \"Hello, World!\",\n", " \"Hi, Mom!\",\n", " \"Good morning, Miss Crabtree!\",\n", " \"Yo, Adrian!\",\n", " \"Whattup, G?\",\n", " \"How's it goin', Dude?\",\n", " \"Hey, Jude!\",\n", " \"Goodbye, Mr. Chips!\",\n", "]:\n", " parsed_text = greeting.parseString(text)\n", " print(parsed_text)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Hello']\n", "['Hi']\n", "['Good', 'morning']\n", "['Yo']\n", "['Whattup']\n", "[\"How's\", 'it', \"goin'\"]\n", "['Hey']\n", "['Goodbye']\n" ] } ], "source": [ "##\n", "## Extracción de la parte del saludo\n", "##\n", "for t in [\n", " \"Hello, World!\",\n", " \"Hi, Mom!\",\n", " \"Good morning, Miss Crabtree!\",\n", " \"Yo, Adrian!\",\n", " \"Whattup, G?\",\n", " \"How's it goin', Dude?\",\n", " \"Hey, Jude!\",\n", " \"Goodbye, Mr. Chips!\",\n", "]:\n", " results = greeting.parseString(t)\n", " salutation = []\n", " for token in results:\n", " if token == \",\": break\n", " salutation.append(token)\n", " print(salutation)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['Hello'], ',', ['World'], '!']\n", "[['Hi'], ',', ['Mom'], '!']\n", "[['Good', 'morning'], ',', ['Miss', 'Crabtree'], '!']\n", "[['Yo'], ',', ['Adrian'], '!']\n", "[['Whattup'], ',', ['G'], '?']\n", "[[\"How's\", 'it', \"goin'\"], ',', ['Dude'], '?']\n", "[['Hey'], ',', ['Jude'], '!']\n", "[['Goodbye'], ',', ['Mr.', 'Chips'], '!']\n" ] } ], "source": [ "##\n", "## Adicion de grupos usando Group()\n", "##\n", "word = pp.Word(pp.alphas+\"'.\")\n", "salutation = pp.Group(pp.OneOrMore(word)) ## <- regla modificada\n", "comma = pp.Literal(\",\")\n", "greetee = pp.Group( pp.OneOrMore(word) ) ## <- regla modificada\n", "endpunc = pp.oneOf(\"! ?\")\n", "greeting = salutation + comma + greetee + endpunc\n", "\n", "for t in [\n", " \"Hello, World!\",\n", " \"Hi, Mom!\",\n", " \"Good morning, Miss Crabtree!\",\n", " \"Yo, Adrian!\",\n", " \"Whattup, G?\",\n", " \"How's it goin', Dude?\",\n", " \"Hey, Jude!\",\n", " \"Goodbye, Mr. Chips!\",\n", "]:\n", " parsed_text = greeting.parseString(t)\n", " print(parsed_text)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Hello'] ['World'] !\n", "['Hi'] ['Mom'] !\n", "['Good', 'morning'] ['Miss', 'Crabtree'] !\n", "['Yo'] ['Adrian'] !\n", "['Whattup'] ['G'] ?\n", "[\"How's\", 'it', \"goin'\"] ['Dude'] ?\n", "['Hey'] ['Jude'] !\n", "['Goodbye'] ['Mr.', 'Chips'] !\n" ] } ], "source": [ "##\n", "## Es posible asignar cada parte a una \n", "## variable para su uso posterior\n", "##\n", "for t in [\n", " \"Hello, World!\",\n", " \"Hi, Mom!\",\n", " \"Good morning, Miss Crabtree!\",\n", " \"Yo, Adrian!\",\n", " \"Whattup, G?\",\n", " \"How's it goin', Dude?\",\n", " \"Hey, Jude!\",\n", " \"Goodbye, Mr. Chips!\",\n", "]:\n", " salutation, _, greetee, endpunc = greeting.parseString(t)\n", " print(salutation, greetee, endpunc)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Hello'] ['World'] !\n", "['Hi'] ['Mom'] !\n", "['Good', 'morning'] ['Miss', 'Crabtree'] !\n", "['Yo'] ['Adrian'] !\n", "['Whattup'] ['G'] ?\n", "[\"How's\", 'it', \"goin'\"] ['Dude'] ?\n", "['Hey'] ['Jude'] !\n", "['Goodbye'] ['Mr.', 'Chips'] !\n" ] } ], "source": [ "##\n", "## Supresión de elementos con Suppress\n", "##\n", "word = pp.Word(pp.alphas+\"'.\")\n", "salutation = pp.Group(pp.OneOrMore(word)) ## <- regla modificada\n", "comma = pp.Suppress(pp.Literal(\",\"))\n", "greetee = pp.Group( pp.OneOrMore(word) ) ## <- regla modificada\n", "endpunc = pp.oneOf(\"! ?\")\n", "greeting = salutation + comma + greetee + endpunc\n", "\n", "\n", "for t in [\n", " \"Hello, World!\",\n", " \"Hi, Mom!\",\n", " \"Good morning, Miss Crabtree!\",\n", " \"Yo, Adrian!\",\n", " \"Whattup, G?\",\n", " \"How's it goin', Dude?\",\n", " \"Hey, Jude!\",\n", " \"Goodbye, Mr. Chips!\",\n", "]:\n", " salutation, greetee, endpunc = greeting.parseString(t)\n", " print(salutation, greetee, endpunc)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('Hello', '!'), ('Hi', '!'), ('Good morning', '!'), ('Yo', '!'), ('Whattup', '?'), (\"How's it goin'\", '?'), ('Hey', '!'), ('Goodbye', '!')]\n", "---\n", "['World', 'Mom', 'Miss Crabtree', 'Adrian', 'G', 'Dude', 'Jude', 'Mr. Chips']\n" ] } ], "source": [ "##\n", "## Separación en partes usando listas\n", "##\n", "salutes = []\n", "greetees = []\n", "\n", "for t in [\n", " \"Hello, World!\",\n", " \"Hi, Mom!\",\n", " \"Good morning, Miss Crabtree!\",\n", " \"Yo, Adrian!\",\n", " \"Whattup, G?\",\n", " \"How's it goin', Dude?\",\n", " \"Hey, Jude!\",\n", " \"Goodbye, Mr. Chips!\",\n", "]:\n", " salutation, greetee, endpunc = greeting.parseString(t)\n", " salutes.append( ( \" \".join(salutation), endpunc) )\n", " greetees.append( \" \".join(greetee) )\n", " \n", "print(salutes)\n", "print('---')\n", "print(greetees)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Goodbye, G!\n", "Hello, World!\n", "Hi, Dude!\n", "Hey, Mom!\n", "Goodbye, Jude!\n", "Yo, World!\n", "Hey, World!\n", "Goodbye, Mr. Chips!\n", "Hey, Adrian!\n", "Good morning, Adrian!\n", "Hello, Miss Crabtree!\n", "Hello, Miss Crabtree!\n", "Hello, Mom!\n", "How's it goin', Miss Crabtree?\n", "Goodbye, Miss Crabtree!\n", "Yo, Miss Crabtree!\n", "Hi, Jude!\n", "Yo, Mr. Chips!\n", "Hey, Miss Crabtree!\n", "Hey, Jude!\n" ] } ], "source": [ "##\n", "## Generación de cadenas aleatorias\n", "##\n", "import random\n", "\n", "for i in range(20):\n", " salute = random.choice( salutes )\n", " greetee = random.choice( greetees )\n", " print(\"{:s}, {:s}{:s}\".format( salute[0], greetee, salute[1] ))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Adrian, say \"Hey!\" to Mom.\n", "G, say \"Goodbye!\" to Adrian.\n", "Mom, say \"Goodbye!\" to Dude.\n", "Mr. Chips, say \"Goodbye!\" to Miss Crabtree.\n", "Jude, say \"Good morning!\" to Mr. Chips.\n", "Adrian, say \"Yo!\" to Dude.\n", "Jude, say \"Yo!\" to Dude.\n", "Mom, say \"Hi!\" to World.\n", "World, say \"Hi!\" to G.\n", "G, say \"Good morning!\" to Mom.\n", "Dude, say \"Hi!\" to G.\n", "G, say \"Goodbye!\" to Jude.\n", "Jude, say \"Good morning!\" to Adrian.\n", "Adrian, say \"Hi!\" to G.\n", "World, say \"Whattup?\" to Miss Crabtree.\n", "Dude, say \"Hi!\" to Jude.\n", "World, say \"Hey!\" to Miss Crabtree.\n", "Adrian, say \"Good morning!\" to Jude.\n", "G, say \"Hello!\" to Mom.\n", "Miss Crabtree, say \"Yo!\" to Adrian.\n" ] } ], "source": [ "##\n", "## Otro ejemplo de frases aleatorias\n", "##\n", "for i in range(20):\n", " print(\n", " '{:s}, say \"{:s}\" to {:s}.'.format(\n", " random.choice(greetees),\n", " \"\".join(random.choice(salutes)),\n", " random.choice(greetees),\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['09', '/', '04', '/', '2004', 'Virginia', '44', 'Temple', '14']\n", "['09', '/', '04', '/', '2004', 'LSU', '22', 'Oregon', 'State', '21']\n", "['09', '/', '09', '/', '2004', 'Troy', 'State', '24', 'Missouri', '14']\n", "['01', '/', '02', '/', '2003', 'Florida', 'State', '103', 'University', 'of', 'Miami', '2']\n" ] } ], "source": [ "##\n", "## Ejemplo de un texto\n", "##\n", "## 09/04/2004 Virginia 44 Temple 14\n", "## 09/04/2004 LSU 22 Oregon State 21\n", "## 09/09/2004 Troy State 24 Missouri 14\n", "## 01/02/2003 Florida State 103 University of Miami 2\n", "##\n", "\n", "\n", "##\n", "## Gramática básica para capturar los datos\n", "##\n", "num = pp.Word(pp.nums)\n", "date = num + \"/\" + num + \"/\" + num\n", "schoolName = pp.OneOrMore( pp.Word(pp.alphas) )\n", "score = pp.Word(pp.nums)\n", "schoolAndScore = schoolName + score\n", "gameResult = date + schoolAndScore + schoolAndScore\n", "\n", "tests = \"\"\"\\\n", "09/04/2004 Virginia 44 Temple 14\n", "09/04/2004 LSU 22 Oregon State 21\n", "09/09/2004 Troy State 24 Missouri 14\n", "01/02/2003 Florida State 103 University of Miami 2\"\"\".splitlines()\n", "\n", "for test in tests:\n", " stats = gameResult.parseString(test)\n", " print(stats.asList())" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['09/04/2004', 'Virginia', '44', 'Temple', '14']\n", "['09/04/2004', 'LSU', '22', 'Oregon', 'State', '21']\n", "['09/09/2004', 'Troy', 'State', '24', 'Missouri', '14']\n", "['01/02/2003', 'Florida', 'State', '103', 'University', 'of', 'Miami', '2']\n" ] } ], "source": [ "##\n", "## Combina los strings para las fechas\n", "## usando Combine\n", "##\n", "num = pp.Word(pp.nums)\n", "date = pp.Combine(num + \"/\" + num + \"/\" + num)\n", "schoolName = pp.OneOrMore( pp.Word(pp.alphas) )\n", "score = pp.Word(pp.nums)\n", "schoolAndScore = schoolName + score\n", "gameResult = date + schoolAndScore + schoolAndScore\n", "\n", "tests = \"\"\"\\\n", "09/04/2004 Virginia 44 Temple 14\n", "09/04/2004 LSU 22 Oregon State 21\n", "09/09/2004 Troy State 24 Missouri 14\n", "01/02/2003 Florida State 103 University of Miami 2\"\"\".splitlines()\n", "\n", "for test in tests:\n", " stats = gameResult.parseString(test)\n", " print(stats.asList())\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['09/04/2004', 'Virginia', '44', 'Temple', '14']\n", "['09/04/2004', 'LSU', '22', 'Oregon State', '21']\n", "['09/09/2004', 'Troy State', '24', 'Missouri', '14']\n", "['01/02/2003', 'Florida State', '103', 'University of Miami', '2']\n" ] } ], "source": [ "##\n", "## Combina los strings de los nombres\n", "##\n", "num = pp.Word(pp.nums)\n", "date = pp.Combine(num + \"/\" + num + \"/\" + num)\n", "schoolName = pp.OneOrMore( pp.Word(pp.alphas) )\n", "schoolName.setParseAction( lambda tokens: \" \".join(tokens) ) # <- modificación\n", "score = pp.Word(pp.nums)\n", "schoolAndScore = schoolName + score\n", "gameResult = date + schoolAndScore + schoolAndScore\n", "\n", "tests = \"\"\"\\\n", "09/04/2004 Virginia 44 Temple 14\n", "09/04/2004 LSU 22 Oregon State 21\n", "09/09/2004 Troy State 24 Missouri 14\n", "01/02/2003 Florida State 103 University of Miami 2\"\"\".splitlines()\n", "\n", "for test in tests:\n", " stats = gameResult.parseString(test)\n", " print(stats.asList())\n", "\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "ename": "ParseException", "evalue": "Invalid date string (19/04/2004) (at char 0), (line:1, col:1)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36mvalidateDateString\u001b[0;34m(tokens)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrptime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%m/%d/%Y\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib/python3.6/_strptime.py\u001b[0m in \u001b[0;36m_strptime_time\u001b[0;34m(data_string, format)\u001b[0m\n\u001b[1;32m 558\u001b[0m format string.\"\"\"\n\u001b[0;32m--> 559\u001b[0;31m \u001b[0mtt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_strptime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 560\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstruct_time\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_STRUCT_TM_ITEMS\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib/python3.6/_strptime.py\u001b[0m in \u001b[0;36m_strptime\u001b[0;34m(data_string, format)\u001b[0m\n\u001b[1;32m 361\u001b[0m raise ValueError(\"time data %r does not match format %r\" %\n\u001b[0;32m--> 362\u001b[0;31m (data_string, format))\n\u001b[0m\u001b[1;32m 363\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: time data '19/04/2004' does not match format '%m/%d/%Y'", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mParseException\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtests\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 29\u001b[0;31m \u001b[0mstats\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgameResult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparseString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 30\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masList\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/pyparsing.py\u001b[0m in \u001b[0;36mparseString\u001b[0;34m(self, instring, parseAll)\u001b[0m\n\u001b[1;32m 1953\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__traceback__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1954\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_trim_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1955\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1956\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1957\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mvalidateDateString\u001b[0;34m(tokens)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrptime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%m/%d/%Y\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mpp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mParseException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Invalid date string (%s)\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mdate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetParseAction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalidateDateString\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mParseException\u001b[0m: Invalid date string (19/04/2004) (at char 0), (line:1, col:1)" ] } ], "source": [ "##\n", "## Validación de las fechas\n", "##\n", "num = pp.Word(pp.nums)\n", "date = pp.Combine(num + \"/\" + num + \"/\" + num)\n", "schoolName = pp.OneOrMore( pp.Word(pp.alphas) )\n", "schoolName.setParseAction( lambda tokens: \" \".join(tokens) ) # <- modificación\n", "score = pp.Word(pp.nums)\n", "schoolAndScore = schoolName + score\n", "gameResult = date + schoolAndScore + schoolAndScore\n", "\n", "import time\n", "def validateDateString(tokens):\n", " try:\n", " time.strptime(tokens[0], \"%m/%d/%Y\")\n", " except ValueError:\n", " raise pp.ParseException(\"Invalid date string (%s)\" % tokens[0])\n", "\n", "date.setParseAction(validateDateString) \n", " \n", "## se modifica la primera linea para generar el error \n", "tests = \"\"\"\\\n", "19/04/2004 Virginia 44 Temple 14\n", "09/04/2004 LSU 22 Oregon State 21\n", "09/09/2004 Troy State 24 Missouri 14\n", "01/02/2003 Florida State 103 University of Miami 2\"\"\".splitlines()\n", "\n", "for test in tests:\n", " stats = gameResult.parseString(test)\n", " print(stats.asList())\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['09/04/2004', ['Virginia', 44], ['Temple', 14]]\n", "['09/04/2004', ['LSU', 22], ['Oregon State', 21]]\n", "['09/09/2004', ['Troy State', 24], ['Missouri', 14]]\n", "['01/02/2003', ['Florida State', 103], ['University of Miami', 2]]\n" ] } ], "source": [ "##\n", "## Se convierte en entero el score y se agrupa \n", "##\n", "num = pp.Word(pp.nums)\n", "date = pp.Combine(num + \"/\" + num + \"/\" + num)\n", "schoolName = pp.OneOrMore( pp.Word(pp.alphas) )\n", "schoolName.setParseAction( lambda tokens: \" \".join(tokens) ) \n", "score = pp.Word(pp.nums).setParseAction( lambda tokens : int(tokens[0]) ) # <- modificación\n", "schoolAndScore = pp.Group(schoolName + score)\n", "gameResult = date + schoolAndScore + schoolAndScore\n", "\n", "import time\n", "def validateDateString(tokens):\n", " try:\n", " time.strptime(tokens[0], \"%m/%d/%Y\")\n", " except ValueError:\n", " raise pp.ParseException(\"Invalid date string (%s)\" % tokens[0])\n", "\n", "date.setParseAction(validateDateString) \n", " \n", "## se modifica la primera linea para generar el error \n", "tests = \"\"\"\\\n", "09/04/2004 Virginia 44 Temple 14\n", "09/04/2004 LSU 22 Oregon State 21\n", "09/09/2004 Troy State 24 Missouri 14\n", "01/02/2003 Florida State 103 University of Miami 2\"\"\".splitlines()\n", "\n", "for test in tests:\n", " stats = gameResult.parseString(test)\n", " print(stats.asList())\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "09/04/2004 Virginia(44) Temple(14), won by Virginia\n", "09/04/2004 LSU(22) Oregon State(21), won by LSU\n", "09/09/2004 Troy State(24) Missouri(14), won by Troy State\n", "01/02/2003 Florida State(103) University of Miami(2), won by Florida State\n" ] } ], "source": [ "##\n", "## Introducción de textos explicativos\n", "##\n", "for test in tests:\n", " stats = gameResult.parseString(test)\n", " if stats[1][1] != stats[2][1]:\n", " if stats[1][1] > stats[2][1]:\n", " result = \"won by \" + stats[1][0]\n", " else:\n", " result = \"won by \" + stats[2][0]\n", " else:\n", " result = \"tied\"\n", " print(\n", " \"{:s} {:s}({:d}) {:s}({:d}), {:s}\".format(\n", " stats[0], stats[1][0], stats[1][1], stats[2][0], stats[2][1], result\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "09/04/2004 Virginia(44) Temple(14), won by Virginia\n", "09/04/2004 LSU(22) Oregon State(21), won by LSU\n", "09/09/2004 Troy State(24) Missouri(14), won by Troy State\n", "01/02/2003 Florida State(103) University of Miami(2), won by Florida State\n" ] } ], "source": [ "##\n", "## Manejo de nombres para aumentar la legibilidad\n", "##\n", "num = pp.Word(pp.nums)\n", "date = pp.Combine(num + \"/\" + num + \"/\" + num)\n", "schoolName = pp.OneOrMore(pp.Word(pp.alphas))\n", "schoolName.setParseAction(lambda tokens: \" \".join(tokens))\n", "score = pp.Word(pp.nums).setParseAction(\n", " lambda tokens: int(tokens[0])\n", ") \n", "schoolAndScore = pp.Group(\n", " schoolName.setResultsName(\"school\") + score.setResultsName(\"score\")\n", ")\n", "gameResult = (\n", " date.setResultsName(\"date\")\n", " + schoolAndScore.setResultsName(\"team1\")\n", " + schoolAndScore.setResultsName(\"team2\")\n", ")\n", "\n", "date.setParseAction(validateDateString)\n", "\n", "\n", "for test in tests:\n", " stats = gameResult.parseString(test)\n", " if stats.team1.score != stats.team2.score:\n", " if stats.team1.score > stats.team2.score:\n", " result = \"won by \" + stats.team1.school\n", " else:\n", " result = \"won by \" + stats.team2.school\n", " else:\n", " result = \"tied\"\n", " print(\n", " \"{:s} {:s}({:d}) {:s}({:d}), {:s}\".format(\n", " stats.date,\n", " stats.team1.school,\n", " stats.team1.score,\n", " stats.team2.school,\n", " stats.team2.score,\n", " result,\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['01/02/2003', ['Florida State', 103], ['University of Miami', 2]]\n", "- date: '01/02/2003'\n", "- team1: ['Florida State', 103]\n", " - school: 'Florida State'\n", " - score: 103\n", "- team2: ['University of Miami', 2]\n", " - school: 'University of Miami'\n", " - score: 2\n" ] } ], "source": [ "##\n", "## Se puede usar dump() para imprimir la info\n", "## y revisar\n", "##\n", "print(stats.dump())" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " 01/02/2003\n", " \n", " Florida State\n", " 103\n", " \n", " \n", " University of Miami\n", " 2\n", " \n", "\n" ] } ], "source": [ "##\n", "## Se puede generar XML\n", "##\n", "print(stats.asXML(\"GAME\"))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'About Menu' : ../images/image-about.jpg\n", "'Careers Menu' : ../images/image-careers.jpg\n", "'Offices Menu' : ../images/image-offices.jpg\n", "'News Menu' : ../images/image-news.jpg\n", "'Library Menu' : ../images/image-library.jpg\n" ] } ], "source": [ "##\n", "## Lectura de HTML\n", "##\n", "from pyparsing import makeHTMLTags\n", "import urllib\n", "\n", "url = \"https://www.cia.gov/library/publications/the-world-factbook/docs/refmaps.html\"\n", "html = urllib.request.urlopen(url).read()\n", "\n", "## Define la expresión para el tag \n", "imgTag,endImgTag = makeHTMLTags(\"img\")\n", "\n", "## busca el tag e imprime los atributos\n", "for img in imgTag.searchString(html):\n", " if img['src'].endswith('jpg'):\n", " print(\"'{:s}' : {:s}\".format(img['alt'], img['src']))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "##\n", "## Lectura de las componentes de una tabla\n", "## (corregir el siguiente codigo)\n", "##\n", "\n", "import urllib\n", "from pyparsing import *\n", "\n", "url = (\n", " \"https://www.cia.gov/library/\"\n", " \"publications/the-world-factbook/\"\n", " \"appendix/appendix-g.html\"\n", ")\n", "\n", "## abre la pagina y la lee\n", "page = urllib.request.urlopen(url)\n", "html = page.read()\n", "page.close()\n", "\n", "## crea los tags de la tabla\n", "tdStart, tdEnd = makeHTMLTags(\"td\")\n", "trStart, trEnd = makeHTMLTags(\"tr\")\n", "\n", "## especificación del parser para las componentes de la tabla\n", "decimalNumber = Word(nums + \",\") + Optional(\".\" + OneOrMore(Word(nums)))\n", "joinTokens = lambda tokens: \"\".join(tokens)\n", "stripCommas = lambda tokens: tokens[0].replace(\",\", \"\")\n", "convertToFloat = lambda tokens: float(tokens[0])\n", "decimalNumber.setParseAction(joinTokens, stripCommas, convertToFloat)\n", "conversionValue = tdStart + decimalNumber.setResultsName(\"factor\") + tdEnd\n", "units = SkipTo(tdEnd)\n", "\n", "## rutina auxiliar para limpiar la tabla\n", "def htmlCleanup(t):\n", " unitText = t[0]\n", " unitText = \" \".join(unitText.split())\n", " unitText = unitText.replace(\"
\", \"\")\n", " return unitText\n", "\n", "units.setParseAction(htmlCleanup)\n", "\n", "## componente del parser para leer la tabla\n", "fromUnit = tdStart + units.setResultsName(\"fromUnit\") + tdEnd\n", "toUnit = tdStart + units.setResultsName(\"toUnit\") + tdEnd\n", "conversion = trStart + fromUnit + toUnit + conversionValue + trEnd\n", "\n", "## imprime los resultados\n", "for tokens, start, end in conversion.scanString(html):\n", " print(tokens, '>')\n", " print(\n", " \"{:s} : {:s} : {:s}\".format(\n", " tokens[\"fromUnit\"], tokens[\"toUnit\"], tokens[\"factor\"]\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " red\n", "['red']\n", "\n", " 100\n", "['100']\n", "\n", " ( red 100 blue )\n", "['red', '100', 'blue']\n", "\n", " ( green ( ( 1 2 ) mauve ) plaid () )\n", "['green', '1', '2', 'mauve', 'plaid']\n", "\n" ] } ], "source": [ "##\n", "## Ejemplo S-expression Parser\n", "## \n", "## 1\n", "## x\n", "## (+ 1 2)\n", "## (* (+ 1 2) (+ 3 4))\n", "##\n", "\n", "alphaword = pp.Word(pp.alphas)\n", "integer = pp.Word(pp.nums)\n", "\n", "sexp = Forward()\n", "\n", "LPAREN = pp.Suppress(\"(\")\n", "RPAREN = pp.Suppress(\")\")\n", "\n", "sexp << ( alphaword | integer | ( LPAREN + ZeroOrMore(sexp) + RPAREN ))\n", "\n", "tests = \"\"\"\\\n", " red\n", " 100\n", " ( red 100 blue )\n", " ( green ( ( 1 2 ) mauve ) plaid () )\"\"\".splitlines()\n", "for t in tests:\n", " print(t)\n", " print(sexp.parseString(t))\n", " print()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " red\n", "['red']\n", "\n", " 100\n", "['100']\n", "\n", " ( red 100 blue )\n", "[['red', '100', 'blue']]\n", "\n", " ( green ( ( 1 2 ) mauve ) plaid () )\n", "[['green', [['1', '2'], 'mauve'], 'plaid', []]]\n", "\n" ] } ], "source": [ "##\n", "## Agrupación de las expresiones S\n", "##\n", "alphaword = pp.Word(pp.alphas)\n", "integer = pp.Word(pp.nums)\n", "\n", "sexp = Forward()\n", "\n", "LPAREN = pp.Suppress(\"(\")\n", "RPAREN = pp.Suppress(\")\")\n", "\n", "sexp << ( alphaword | integer | pp.Group( LPAREN + ZeroOrMore(sexp) + RPAREN ) ) # <--\n", "\n", "tests = \"\"\"\\\n", " red\n", " 100\n", " ( red 100 blue )\n", " ( green ( ( 1 2 ) mauve ) plaid () )\"\"\".splitlines()\n", "for t in tests:\n", " print(t)\n", " print(sexp.parseString(t))\n", " print()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wood and blue or red\n", "[['wood', 'and', 'blue'], 'or', 'red']\n", "\n", "wood and (blue or red)\n", "['wood', 'and', ['blue', 'or', 'red']]\n", "\n", "(steel or iron) and \"lime green\"\n", "[['steel', 'or', 'iron'], 'and', 'lime green']\n", "\n", "not steel or iron and \"lime green\"\n", "[['not', 'steel'], 'or', ['iron', 'and', 'lime green']]\n", "\n", "not(steel or iron) and \"lime green\"\n", "[['not', ['steel', 'or', 'iron']], 'and', 'lime green']\n", "\n" ] } ], "source": [ "##\n", "## Parser para expresiones de búsqueda\n", "##\n", "## wood and blue or red\n", "## wood and (blue or red)\n", "## (steel or iron) and \"lime green\"\n", "## not steel or iron and \"lime green\"\n", "## not(steel or iron) and \"lime green\"\n", "##\n", "\n", "from pyparsing import *\n", "\n", "and_ = CaselessLiteral(\"and\")\n", "or_ = CaselessLiteral(\"or\")\n", "not_ = CaselessLiteral(\"not\")\n", "searchTerm = Word(alphanums) | quotedString.setParseAction(removeQuotes)\n", "searchExpr = operatorPrecedence(\n", " searchTerm,\n", " [\n", " (not_, 1, opAssoc.RIGHT),\n", " (and_, 2, opAssoc.LEFT),\n", " (or_, 2, opAssoc.LEFT),\n", " ],\n", ")\n", "tests = \"\"\"\\\n", " wood and blue or red\n", " wood and (blue or red)\n", " (steel or iron) and \"lime green\"\n", " not steel or iron and \"lime green\"\n", " not(steel or iron) and \"lime green\" \"\"\".splitlines()\n", "\n", "for t in tests:\n", " print(t.strip())\n", " print(searchExpr.parseString(t)[0])\n", " print()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }