Initial commit
commit 56a79e3562
55 changed files with 1905 additions and 0 deletions
0  src/nrsk/__init__.py  Normal file
0  src/nrsk/regs/__init__.py  Normal file
168  src/nrsk/regs/load_10cfr50.py  Executable file
@@ -0,0 +1,168 @@
"""
|
||||
Read text 10 CFR 50 App B and turn to json and xlsx.
|
||||
|
||||
Text should be loaded from web and pasted into data folder.
|
||||
|
||||
Needs to eventually write requirements data to flow down to QA program.
|
||||
|
||||
One way to do that is to write a ``needs.json`` file to the format
|
||||
described `here <https://sphinx-needs.readthedocs.io/en/latest/builders.html#format>`_.
|
||||
"""
|
||||
|
||||
import json
import logging
import re

import requests
from bs4 import BeautifulSoup

from nrsk.utils import excel, get_project_root, needs, tokenize

logging.basicConfig(level=logging.INFO)

ROOT = get_project_root()
LOG = logging.getLogger(__name__)

TXT_10_CFR_50_APPA = ROOT / "data" / "regs" / "10-cfr-50-app-a.txt"
TXT_10_CFR_50_APPB = ROOT / "data" / "regs" / "10-cfr-50-app-b.txt"
OUT_PATH = ROOT / "documents" / "generated_assets"

# todo: move to config file in case they change?
URL_APP_A = (
    "https://www.nrc.gov/reading-rm/doc-collections/cfr/part050/part050-appa.html"
)
URL_APP_B = (
    "https://www.nrc.gov/reading-rm/doc-collections/cfr/part050/part050-appb.html"
)


def fetch_10cfr50_app_a():
    """Go online and get the text of 10 CFR 50 App A.

    We still commit the results to the repo because this system
    may be run in an environment that does not have internet access.

    This is here to show how the committed files are created in the first place.

    Note that this may require changes if the NRC updates the format of the
    webpage.
    """

    def _is_criteria_label(tag):
        return tag.name == "p" and tag.string == "Criteria"

    html_app_a = requests.get(URL_APP_A).text
    soup = BeautifulSoup(html_app_a, "html.parser")
    # criteria are all in the last general-content div
    content = soup.find_all("div", class_="general-content")[-1]
    label = content.find_next(_is_criteria_label)

    with TXT_10_CFR_50_APPA.open("w") as f:
        for crit in label.find_all_next("p"):
            if "1 Further details relating" in crit.text:
                # marks the end
                break
            f.write(crit.text + "\n\n")


def process_10cfr50_app_a():
    """
    Convert App A text (General Design Criteria) into Sphinx Needs.

    Each criterion has a title followed by a number of "shall" sentences,
    so each criterion number should make one main need plus several child needs.

    The input text file should start in the Criteria section, e.g. at Overall
    Requirements.

    Since this makes parent/child requirements, we'll try using the sphinx-needs
    list2need feature for this one:

    https://sphinx-needs.readthedocs.io/en/latest/directives/list2need.html

    list2need mangles labels on (1) and (2) listings in the text, so those are
    remapped to 1., 2., etc. It also mangles non-numerical parentheticals
    (likely a sphinx-needs bug), so those are remapped to comma phrases.
    Even with both remappings some output still comes out mangled.

    .. todo:: submit a bug report to sphinx-needs with these examples.
    """

    def process(line):
        # remap "(1)" -> "1." and any other "(...)" -> ", ...," (see docstring)
        line = re.sub(r"\((\d+)\)", r"\1.", line)
        line = re.sub(r"\((.+)\)", r", \1,", line)
        return line

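    # Illustrative examples of the remapping, not part of the original commit:
    #   process("items (1) and (2)")      -> "items 1. and 2."
    #   process("the system (and parts)") -> "the system , and parts,"
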
    needs_lines = [".. list2need::", " :types: req, req, req", ""]
    with open(TXT_10_CFR_50_APPA) as inp_f:
        for line in inp_f:
            line = line.strip()
            if not line:
                continue
            if re.search(r"^[A-Z]+\. [A-Z]", line):
                LOG.info("Reading %s", line)
            elif match := re.search(r"^Criterion (\d+)—(.+?)\.(.+)", line):
                num = match.group(1).strip()
                title = match.group(2).strip()
                subnum = 1
                LOG.info(
                    "Found Criterion %s - %s",
                    match.group(1).strip(),
                    match.group(2).strip(),
                )
                needs_lines.append(f" * (R_GDC_{num}){process(title)}")
                sentences = tokenize.tokenize_sentences(match.group(3).strip())
                for sent in sentences:
                    needs_lines.append(
                        f" * (R_GDC_{num}_{subnum}){process(sent.strip())}"
                    )
                    subnum += 1
            else:
                sentences = tokenize.tokenize_sentences(line)
                for sent in sentences:
                    needs_lines.append(
                        f" * (R_GDC_{num}_{subnum}){process(sent.strip())}"
                    )
                    subnum += 1

    with open(OUT_PATH / "10-cfr-50-app-a-list.rst", "w") as out_f:
        out_f.write("\n".join(needs_lines))

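
# Illustrative only, not part of the original commit: the generated
# 10-cfr-50-app-a-list.rst is a list2need block whose bullets become needs,
# roughly of this shape (titles and sentences are placeholders):
#
#   .. list2need::
#      :types: req, req, req
#
#      * (R_GDC_1)<criterion title>
#        * (R_GDC_1_1)<first "shall" sentence>
#        * (R_GDC_1_2)<second "shall" sentence>
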
def process_10cfr50_app_b():
    """
    Read 10 CFR 50 App B and generate needs data and an xlsx file from it.

    Just tokenizing the text straight up works ok.
    """
    with open(TXT_10_CFR_50_APPB) as inp_f:
        sentences = tokenize.tokenize_sentences(inp_f.read())

    needs_data = []
    i = 0
    for s in sentences:
        if len(s) < 40:
            LOG.warning(f"Skipping short sentence: `{s}`")
            continue
        needs_data.append((f"10 CFR 50 App B Sentence {i}", s))
        i += 1
    with (OUT_PATH / "10-cfr-50-app-b.json").open("w") as f:
        json.dump(
            needs.data_to_needs(needs_data, prefix="R_APPB_", links=["R_10CFR50_APPB"]),
            f,
            indent=4,
        )

    filtered_titles, filtered_sentences = zip(*needs_data)
    wb = excel.make_xlsx_from_sentences(filtered_sentences)
    wb.save(OUT_PATH / "10-cfr-50-app-b.xlsx")  # can only write to disk, not stream


if __name__ == "__main__":
    # fetch_10cfr50_app_a()
    OUT_PATH.mkdir(parents=True, exist_ok=True)
    process_10cfr50_app_a()
    process_10cfr50_app_b()

5  src/nrsk/utils/__init__.py  Normal file
@@ -0,0 +1,5 @@
"""General Utils."""
|
||||
from pathlib import Path
|
||||
|
||||
def get_project_root() -> Path:
|
||||
return Path(__file__).parent.parent.parent.parent
|
||||
12  src/nrsk/utils/excel.py  Normal file
@@ -0,0 +1,12 @@
"""Excel utils."""
|
||||
|
||||
from openpyxl import Workbook
|
||||
|
||||
|
||||
def make_xlsx_from_sentences(sentences: list) -> Workbook:
|
||||
"""Convert json list of sentences to xlsx with numbers."""
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
for row_data in enumerate(sentences):
|
||||
ws.append(row_data)
|
||||
return wb
|
||||
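
# Usage sketch, illustrative and not part of the original commit:
#   wb = make_xlsx_from_sentences(["First sentence.", "Second sentence."])
#   wb.save("sentences.xlsx")  # rows: 0 | First sentence.  /  1 | Second sentence.
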
53  src/nrsk/utils/forms.py  Normal file
@@ -0,0 +1,53 @@
"""
|
||||
Utilities for working with Forms.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
import yaml
|
||||
|
||||
PDFTK = "/usr/bin/pdftk"
|
||||
|
||||
|
||||
def load_form_data(file_path):
    """Load yaml or json form data intended to be populated into a form template."""
    with open(file_path, "r") as f:
        if file_path.endswith(".yaml"):
            return yaml.safe_load(f)
        elif file_path.endswith(".json"):
            import json

            return json.load(f)

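
# Usage sketch, illustrative only (the file name is hypothetical, not part of
# the original commit):
#   data = load_form_data("software-dedication-form.yaml")
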
def create_fdf(data, fdf_file):
    """Create form data in FDF format to fill into a PDF form template."""
    fdf = "%FDF-1.2\n1 0 obj\n<< /FDF << /Fields ["
    for key, value in data.items():
        value = str(value).replace("\n", " ").replace("\r", "")
        fdf += f"<< /V ({value}) /T ({key}) >>"
    fdf += "] >> >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF"
    with open(fdf_file, "w") as f:
        f.write(fdf)

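
# Illustrative only, not part of the original commit: for data = {"Name": "DIF3D"},
# the FDF written above looks like:
#   %FDF-1.2
#   1 0 obj
#   << /FDF << /Fields [<< /V (DIF3D) /T (Name) >>] >> >>
#   endobj
#   trailer
#   << /Root 1 0 R >>
#   %%EOF
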
def fill_pdf(template_pdf, fdf_file, output_pdf):
    """Populate a given pdf form template with form data."""
    temp_fname = output_pdf + ".filled"
    subprocess.call([PDFTK, template_pdf, "fill_form", fdf_file, "output", temp_fname])
    # and flatten it
    subprocess.call([PDFTK, temp_fname, "output", output_pdf, "flatten"])
    os.remove(temp_fname)

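
# The two pdftk calls above are equivalent to running, roughly (file names
# illustrative, not part of the original commit):
#   pdftk template.pdf fill_form data.fdf output out.pdf.filled
#   pdftk out.pdf.filled output out.pdf flatten
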
def demo():
    # Example usage
    # for checkboxes, use "Yes" and "Off"
    data = {"Name": "DIF3D", "ExportControlled": "Yes", "Type": "Engineering"}
    create_fdf(data, "data.fdf")
    fill_pdf(
        "software-dedication-form.pdf",
        "data.fdf",
        "software-dedication-form-filled.pdf",
    )

37  src/nrsk/utils/needs.py  Normal file
@@ -0,0 +1,37 @@
"""Utils for building/managing sphinx-needs data."""
|
||||
|
||||
import datetime
|
||||
|
||||
|
||||
def data_to_needs(
|
||||
needs_data: list[str, str],
|
||||
prefix="",
|
||||
type="req",
|
||||
type_name="Requirement",
|
||||
links=None,
|
||||
) -> dict:
|
||||
"""
|
||||
Convert list of (titles, texts) to valid needs data.
|
||||
|
||||
Format from: https://sphinx-needs.readthedocs.io/en/latest/builders.html#format
|
||||
"""
|
||||
now = datetime.datetime.now().isoformat()
|
||||
need_data = {}
|
||||
for i, (title, body) in enumerate(needs_data):
|
||||
id = f"{prefix}{i}"
|
||||
need_data[id] = {
|
||||
"description": body,
|
||||
"id": id,
|
||||
"title": title,
|
||||
"type": type,
|
||||
"type_name": type_name,
|
||||
"links": links or [],
|
||||
"tags": [],
|
||||
}
|
||||
data = {
|
||||
"created": now,
|
||||
"current_version": "1.0",
|
||||
"project": "nuclear reactor starter kit",
|
||||
"versions": {"1.0": {"created": now, "needs": need_data}},
|
||||
}
|
||||
return data
|
||||
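
# Illustrative only, not part of the original commit: for
# data_to_needs([("Example title", "Example requirement text.")], prefix="R_EX_"),
# the returned dict looks roughly like:
#   {
#       "created": "<ISO timestamp>",
#       "current_version": "1.0",
#       "project": "nuclear reactor starter kit",
#       "versions": {
#           "1.0": {
#               "created": "<ISO timestamp>",
#               "needs": {
#                   "R_EX_0": {
#                       "description": "Example requirement text.",
#                       "id": "R_EX_0",
#                       "title": "Example title",
#                       "type": "req",
#                       "type_name": "Requirement",
#                       "links": [],
#                       "tags": [],
#                   }
#               }
#           }
#       },
#   }
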
15  src/nrsk/utils/tokenize.py  Normal file
@@ -0,0 +1,15 @@
"""
|
||||
Break input text file into sentences.
|
||||
"""
|
||||
from nltk.tokenize.punkt import PunktSentenceTokenizer
|
||||
|
||||
|
||||
def tokenize_sentences(text: str) -> list:
|
||||
tokenizer = PunktSentenceTokenizer()
|
||||
|
||||
tokenizer.train(text)
|
||||
sentences = tokenizer.tokenize(text)
|
||||
|
||||
return sentences
|
||||
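
# Usage sketch, illustrative only (not part of the original commit):
#   tokenize_sentences("First sentence. Second one.")
#   # -> ["First sentence.", "Second one."]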