Skip to content

Commit 372f6d7

Browse files
committed
Add a series of scripts for fetching and parsing OpenSSL.org manpages
1 parent 88e163d commit 372f6d7

File tree

4 files changed

+157
-0
lines changed

4 files changed

+157
-0
lines changed
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/bin/sh
# Download the man3 pages of several OpenSSL releases from openssl.org.
#
# For each release, the index page is handed to the link scraper and the
# URLs it prints are passed, ten at a time and by up to five parallel
# processes, to concurrent_fetch.py, which stores them in a directory named
# after the release.

script_dir=$(dirname "$0")

for release in 1.0.2 1.1.1 3.0 3.1; do
    out_dir="$release"
    mkdir -p "$out_dir"
    index_url="https://www.openssl.org/docs/man${release}/man3/"
    "$script_dir/openssl_manpage_libcall_scrape.py" "$index_url" |
        xargs -n 10 -P 5 "$script_dir/../concurrent_fetch.py" \
            --output-dir="$out_dir" --verbose --retries=3
done
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/usr/bin/env python3
2+
"""Scrape library manpage listing from OpenSSL.org.
3+
"""
4+
5+
import argparse
6+
import urllib.parse
7+
from typing import Optional
8+
9+
import bs4
10+
import requests
11+
12+
13+
# Name of the parser backend passed to bs4.BeautifulSoup when parsing pages.
BS4_PARSER = "html5lib"
14+
15+
16+
def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse the command line of the manpage-listing scraper.

    Args:
        argv: list of argument strings to parse; ``None`` makes argparse
            fall back to ``sys.argv[1:]``.

    Returns:
        A namespace with a single attribute, ``url``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("url", help="URL of the manpage index page to scrape")

    return parser.parse_args(args=argv)
21+
22+
23+
def get_manpage_html_text(url: str, timeout: float = 30.0) -> str:
    """Download ``url`` and return the response body as text.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            (through ``raise_for_status``) when the response carries an
            HTTP error status.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.text
27+
28+
29+
def extract_libcall_links_from_page(text: str) -> list:
    """Return the ``href`` of every link in the manpage index table.

    The links are taken from the first ``<tbody>`` of the first ``<table>``
    of the first ``<article>`` found inside ``<body>``. The ``href`` values
    are returned exactly as they appear in the page; resolving them against
    the page URL is left to the caller.

    Args:
        text: HTML source of the index page.

    Returns:
        A list of ``href`` strings, in document order.

    Raises:
        ValueError: if the page does not contain the expected
            ``body``/``article``/``table``/``tbody`` nesting.
    """
    node = bs4.BeautifulSoup(text, BS4_PARSER).body
    # Walk down one element at a time so a missing level is reported
    # clearly instead of surfacing as an AttributeError on None.
    for tag_name in ("article", "table", "tbody"):
        if node is None:
            break
        node = node.find(tag_name)
    if node is None:
        raise ValueError(
            "page does not contain the expected body/article/table/tbody layout"
        )
    return [link["href"] for link in node.find_all("a", href=True)]
34+
35+
36+
def main(argv: Optional[list] = None) -> None:
    """Print the absolute URL of every manpage linked from the index page.

    Args:
        argv: list of command-line argument strings; ``None`` means use
            ``sys.argv[1:]``.
    """
    args = parse_args(argv)
    text = get_manpage_html_text(args.url)
    for link in extract_libcall_links_from_page(text):
        # Links in the page may be relative; resolve them against the
        # index URL so each printed line can be fetched directly.
        print(urllib.parse.urljoin(args.url, link))


if __name__ == "__main__":
    main()
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env python3
2+
"""Scrape library manpage output from OpenSSL.org.
3+
"""
4+
5+
import argparse
6+
import html
7+
import logging
8+
import os
9+
import pathlib
10+
import re
11+
import subprocess
12+
from typing import Optional
13+
14+
import bs4
15+
16+
17+
# Start of an OpenSSL header include directive.
INCLUDE_RE = re.compile(r"#include <openssl/")
# A comma directly followed by a line break; substituted with ", " to join
# lines that were broken after a comma. (No re.M: the pattern has no ^/$
# anchors, so the flag had no effect.)
FLATTEN_DECLARATIONS_RE = re.compile(",\n")
# Name of the parser backend passed to bs4.BeautifulSoup.
BS4_PARSER = "html5lib"
20+
21+
22+
def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse the command line of the synopsis extractor.

    Args:
        argv: list of argument strings to parse; ``None`` makes argparse
            fall back to ``sys.argv[1:]``.

    Returns:
        A namespace with ``url`` (a string) and ``output`` (a text file
        object). The output file is opened for writing by argparse while
        the arguments are parsed.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "url", help="URL, or path of a local HTML file, of the manpage"
    )
    parser.add_argument(
        "output",
        type=argparse.FileType("w"),
        help="file the extracted synopsis is written to",
    )

    return parser.parse_args(args=argv)
28+
29+
30+
def fetch_manpage_html_text(url: str, timeout: float = 30.0) -> str:
    """Download ``url`` and return the response body as text.

    Args:
        url: address of the manpage to fetch.
        timeout: seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            (through ``raise_for_status``) when the response carries an
            HTTP error status.
    """
    # Imported here rather than at module level so that the module can be
    # loaded, and local files parsed, without importing requests.
    import requests

    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.text
35+
36+
37+
def extract_synopsis_from_manpage_html(text: str) -> Optional[str]:
    """Extract the synopsis text from the HTML of a manpage.

    The first ``<pre>`` element inside ``<body>`` that contains exactly one
    ``<code>`` element is used. Its text is stripped of leading whitespace,
    HTML-unescaped, and every comma followed by a line break is replaced
    with ``", "``.

    Args:
        text: HTML source of the manpage.

    Returns:
        The processed synopsis text, or ``None`` when the document has no
        ``<pre>`` element holding exactly one ``<code>`` element.
    """
    soup = bs4.BeautifulSoup(text, BS4_PARSER)
    if soup.body is None:
        return None
    for pre_elem in soup.body.find_all("pre"):
        code_elems = pre_elem.find_all("code")
        if len(code_elems) != 1:
            continue
        synopsis_text = html.unescape(code_elems[0].text.lstrip())
        # NOTE(review): when the text does not *start* with an OpenSSL
        # include, every "#include <openssl/" prefix found later in it is
        # deleted, leaving e.g. "ssl.h>" behind. Kept as found; confirm
        # whether such blocks were meant to be skipped instead.
        if INCLUDE_RE.match(synopsis_text) is None:
            synopsis_text = INCLUDE_RE.sub("", synopsis_text)
        return FLATTEN_DECLARATIONS_RE.sub(", ", synopsis_text)
    return None
47+
48+
49+
# Absolute directory containing this script. abspath() guards against
# __file__ being a bare relative name, in which case dirname() alone would
# yield "" and paths built from BINDIR would point at the filesystem root.
BINDIR = os.path.dirname(os.path.abspath(__file__))
50+
51+
def main(argv: Optional[list] = None) -> None:
    """Extract the synopsis of one manpage into a clang-formatted file.

    ``args.url`` is read as a local file when a file of that name exists and
    is fetched over HTTP otherwise. The synopsis is written to the output
    file, which is then reformatted in place with ``clang-format``.

    Failures are reported (printed or logged) rather than raised, and the
    output file is removed so that no empty or partial result is left
    behind.

    Args:
        argv: list of command-line argument strings; ``None`` means use
            ``sys.argv[1:]``.
    """
    args = parse_args(argv)

    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    logging.debug("Will parse synopsis from %s", args.url)

    url_path = pathlib.Path(args.url)
    try:
        text = (
            url_path.read_text() if url_path.exists()
            else fetch_manpage_html_text(args.url)
        )
        synopsis = extract_synopsis_from_manpage_html(text)
        if not synopsis:
            raise AssertionError(f"{args.url} has no synopsis")
        with args.output as output_fp:
            output_fp.write(synopsis)
        # Run the formatter only after the file has been closed, so that it
        # sees the fully flushed contents.
        subprocess.check_call(["clang-format", "-i", f"--style=file:{BINDIR}/.clang_format", args.output.name])
    except AssertionError as exc:
        print(exc)
        # argparse opened the file already; close it before removing it.
        args.output.close()
        os.unlink(args.output.name)
    except Exception:
        logging.exception("")
        args.output.close()
        os.unlink(args.output.name)
    else:
        # Report success only when every step above actually succeeded.
        logging.debug("Parsed synopsis from %s into %s", args.url, args.output.name)


if __name__ == "__main__":
    main()
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/sh
# Extract the synopsis of every *.html manpage below the directory given as
# $1, writing each result next to its source as a .txt file. The parser is
# run in batches of at most $max_workers background jobs.

set -eu

max_workers=10
bindir=$(dirname "$0")
dir_path=$(realpath "${1:?usage: $0 DIRECTORY}")

# NOTE: the file list relies on word splitting, so paths containing
# whitespace are not supported.
set -- $(find "$dir_path" -type f -name '*.html')

while [ $# -gt 0 ]; do
    allocated_workers=0
    # Check for remaining files *before* taking one: checking after the
    # shift skipped the last file and shifted past the end of the list.
    while [ $# -gt 0 ] && [ "$allocated_workers" -lt "$max_workers" ]; do
        html_file=$1
        shift
        "$bindir/parse_synopsis_from_openssl_html_manpage.py" \
            "$html_file" "${html_file%.html*}.txt" &
        allocated_workers=$(( allocated_workers + 1 ))
    done
    # A bare `wait` blocks until all background jobs of this batch are done.
    wait
done

0 commit comments

Comments
 (0)