Dup Ver Goto 📝

Get Webpage Titles

PT2/lang/python/web/example python web does not exist
To
103 lines, 288 words, 2261 chars Page 'GetWebpageTitles' does not exist.

This script is a mess and really needs to be rewritten to use argparse. But if I want to generate a bulleted list of links for this wiki from a list of URLs, it does the job.

#!/usr/bin/env python
import json
import os
import re
import sys
from html import escape, unescape

import requests
# Only lines that start with an http(s) scheme are treated as URLs.
urlre = re.compile(r"^https?://")
# First <title> element; the page is flattened to one line before matching.
titlere = re.compile(r"<title[^>]*>(.*?)</title>", re.I)

HTML_SWITCHES = ("--html", "-h")
# "-p" was advertised by --help but never accepted; it is recognised now.
MARKDOWN_SWITCHES = ("--markdown", "-m", "--md", "--ptmd", "-p")
NUMERIC_SWITCHES = ("--numeric", "-n")

# Timeout in seconds handed to requests.get so a dead server cannot hang us.
FETCH_TIMEOUT = 30


def warn(message):
    """Print a diagnostic to stderr so it never pollutes the generated list."""
    print(message, file=sys.stderr)


def print_help(progname):
    """Print the usage summary to stdout."""
    print(f"{progname}")
    print("  -m --md --markdown --ptmd -p: markdown output")
    print("  -h --html: html output (overridden by markdown)")
    print("  -n --numeric: numbered lists (else bulleted)")
    print("  --help: show this help and exit")


def parse_args(args, progname=None):
    """Split command-line arguments into output flags and input sources.

    Args:
        args: the arguments after the program name.
        progname: name shown by ``--help``; defaults to ``sys.argv[0]``.

    Returns:
        A tuple ``(as_html, markdown, numeric, sources)``. ``sources`` holds
        file names and/or ``"-"`` (stdin) in the order given. Markdown
        overrides HTML when both are requested.

    Unrecognised switches are reported on stderr and otherwise ignored.
    ``--help`` prints the usage text and exits with status 0.
    """
    as_html = markdown = numeric = False
    sources = []
    for arg in args:
        # A lone "-" means stdin, not a switch.
        if arg == "-" or not arg.startswith("-"):
            sources.append(arg)
        elif arg in HTML_SWITCHES:
            as_html = True
        elif arg in MARKDOWN_SWITCHES:
            markdown = True
        elif arg in NUMERIC_SWITCHES:
            numeric = True
        elif arg == "--help":
            print_help(sys.argv[0] if progname is None else progname)
            sys.exit(0)
        else:
            warn(f"Unrecognised switch {arg}")
    if markdown:
        as_html = False
    return as_html, markdown, numeric, sources


def split_lines(text):
    """Return the non-blank lines of *text* with surrounding whitespace removed.

    Stripping also drops the "\\r" left behind by CRLF input, which would
    otherwise end up inside the requested URL.
    """
    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]


def read_sources(sources):
    """Read every source and return the combined list of candidate lines.

    Args:
        sources: file names and/or ``"-"`` for stdin; an empty list means
            stdin only.

    Returns:
        Non-blank, stripped lines from all readable sources, in order.
        Unreadable files are reported on stderr and skipped.
    """
    chunks = []
    stdin_text = None
    for source in sources or ["-"]:
        if source == "-":
            # stdin can only be consumed once; reuse the text if "-" repeats.
            if stdin_text is None:
                stdin_text = sys.stdin.read()
            chunks.append(stdin_text)
            continue
        try:
            with open(source) as handle:
                chunks.append(handle.read().rstrip())
        except (OSError, UnicodeDecodeError) as e:
            warn(f"Exception {e} {type(e)} reading {source}")
    return split_lines("\n".join(chunks))


def fetch_page(url):
    """Download *url* and return its body text, or None if the request fails.

    Failures (connection errors, timeouts, invalid URLs, ...) are reported on
    stderr so that one bad URL does not abort the whole run. The HTTP status
    is not checked: error pages are still mined for a title.
    """
    try:
        return requests.get(url, timeout=FETCH_TIMEOUT).text
    except requests.RequestException as e:
        warn(f"Failed to fetch {url}: {e}")
        return None


def extract_title(page, trim=""):
    """Return the text of the first ``<title>`` element in *page*.

    Args:
        page: raw HTML text.
        trim: optional regular expression; a match at the end of the title is
            removed (before surrounding whitespace is stripped).

    Returns:
        The stripped title, or None when the page has no ``<title>`` element.
    """
    # Flatten to a single line so "." in the pattern can span the whole title.
    flat = page.replace("\r", "").replace("\n", " ")
    found = titlere.search(flat)
    if not found:
        return None
    title = found.group(1)
    if trim != "":
        title = re.sub(f"{trim}$", "", title)
    return title.strip()


def format_entry(url, title, index, *, markdown=False, as_html=False,
                 numeric=False):
    """Format one link as a markdown item, an HTML ``<li>``, or plain text.

    Args:
        url: link target.
        title: link text as extracted from the page (may contain entities).
        index: 1-based position, used for numbered markdown lists.
        markdown: emit ``* [title](url)`` / ``N. [title](url)``.
        as_html: emit ``<li><a href='url'>title</a></li>``; ignored when
            *markdown* is set.
        numeric: number markdown items instead of using bullets.

    Returns:
        The formatted line, without a trailing newline.
    """
    if markdown:
        marker = f"{index}." if numeric else "*"
        return f"{marker} [{title}]({url})"
    if as_html:
        # The URL goes into a quoted attribute and the title into element
        # content; escape both so neither can break out of the markup.
        # Decoding the title first avoids double-escaping existing entities.
        safe_url = escape(url, quote=True)
        safe_title = escape(unescape(title), quote=False)
        return f"<li><a href='{safe_url}'>{safe_title}</a></li>"
    return f"{url} {title}"


def main(argv=None):
    """Print a list of titled links for the URLs named on the command line.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``.

    The ``TRIM`` environment variable, when set, is a regular expression
    removed from the end of every title.
    """
    argv = sys.argv if argv is None else argv
    as_html, markdown, numeric, sources = parse_args(argv[1:], argv[0])
    trim = os.getenv("TRIM", "")
    candidates = read_sources(sources)

    list_tag = "ol" if numeric else "ul"
    if as_html:
        print(f"<{list_tag}>")

    index = 0
    for url in candidates:
        if not urlre.match(url):
            warn(f"Not url: {url}")
            continue
        page = fetch_page(url)
        if page is None:
            continue
        title = extract_title(page, trim)
        if title is None:
            warn(f"No title for {url}")
            continue
        index += 1
        print(format_entry(url, title, index, markdown=markdown,
                           as_html=as_html, numeric=numeric))

    if as_html:
        print(f"</{list_tag}>")


if __name__ == "__main__":
    main()