Tuesday, March 17, 2026

When we discussed an end-to-end GitHub Actions workflow yesterday that demonstrates the creation and usage of an “Issues Dashboard” for periodic insights into the customer experience with the repository assets, we didn't bring up some of the pesky and troublesome tasks, such as finding all pull request references on a GitHub issue — regardless of whether they appear in comments, in events of type “mentioned” or “cross-referenced”, or in the closure of the issue. The sample Python implementation below provides those details.

The Python implementation follows:


import os, re, json, datetime, requests

import hcl2

import pandas as pd


# ---- Configuration (from the Actions environment) ----
REPO = os.environ["GITHUB_REPOSITORY"]  # "owner/repo", provided by GitHub Actions
GH_TOKEN = os.environ["GH_TOKEN"]       # token with read access to the repo

# The mockingbird-preview media type is kept alongside the v3 JSON type for
# timeline-API compatibility (it historically required a preview header).
HEADERS = {
    "Authorization": f"Bearer {GH_TOKEN}",
    "Accept": "application/vnd.github+json, application/vnd.github.mockingbird-preview+json",
    "X-GitHub-Api-Version": "2026-03-10",
}

# ---- Time window (last 7 days) ----
# Fix: datetime.utcnow() is deprecated. Use an aware UTC datetime and format
# it explicitly in the ISO-8601 "Z" form the GitHub `since` parameter expects.
since = (
    datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=7)
).strftime("%Y-%m-%dT%H:%M:%SZ")


# ---- Helpers ----

def gh_get(url, params=None, headers=None):
    """GET a GitHub API URL and return the decoded JSON payload.

    Generalized (backward-compatible): an optional `headers` mapping may be
    supplied to override the module-level HEADERS for a single call — the
    docstring of extract_pr_urls_from_issue already advertises this signature.

    Args:
        url: Absolute API URL.
        params: Optional query-string parameters.
        headers: Optional per-call header override; defaults to HEADERS.

    Raises:
        requests.HTTPError: on any non-2xx response.
    """
    r = requests.get(url, headers=headers if headers is not None else HEADERS, params=params)
    r.raise_for_status()
    return r.json()


def gh_get_text(url):
    """GET `url` with the module auth headers and return the raw body text.

    Raises requests.HTTPError on any non-2xx status.
    """
    resp = requests.get(url, headers=HEADERS)
    resp.raise_for_status()
    return resp.text


def list_closed_issues():
    """Return all closed issues (excluding PRs) updated within the window.

    The Issues endpoint returns both issues and pull requests; items that
    carry a "pull_request" key are PRs and are filtered out.

    Bug fix: the original fetched only the first page (at most 100 items).
    This version pages through every result.
    """
    url = f"https://api.github.com/repos/{REPO}/issues"
    per_page = 100
    issues = []
    page = 1
    while True:
        items = gh_get(
            url,
            params={"state": "closed", "since": since, "per_page": per_page, "page": page},
        )
        if not items:
            break
        issues.extend(i for i in items if "pull_request" not in i)
        if len(items) < per_page:
            break
        page += 1
    return issues


# Full PR URLs as rendered on github.com, e.g.
#   https://github.com/owner/repo/pull/123
PR_HTML_URL_RE = re.compile(
    r"https?://github\.com/(?P<owner>[^/\s]+)/(?P<repo>[^/\s]+)/pull/(?P<num>\d+)",
    re.IGNORECASE,
)
# API-style PR URLs, e.g.
#   https://api.github.com/repos/owner/repo/pulls/123
PR_API_URL_RE = re.compile(
    r"https?://api\.github\.com/repos/(?P<owner>[^/\s]+)/(?P<repo>[^/\s]+)/pulls/(?P<num>\d+)",
    re.IGNORECASE,
)

# Shorthand references that might appear in text:
#   - #123  (assumed to be same repo)
#   - owner/repo#123 (explicit cross-repo)
# The (?<!\w) lookbehind stops the same-repo pattern from matching the "#123"
# inside "owner/repo#123" (its '#' is preceded by a word character); that form
# is handled by the cross-repo pattern instead.
SHORTHAND_SAME_REPO_RE = re.compile(r"(?<!\w)#(?P<num>\d+)\b")
SHORTHAND_CROSS_REPO_RE = re.compile(
    r"(?P<owner>[A-Za-z0-9_.-]+)/(?P<repo>[A-Za-z0-9_.-]+)#(?P<num>\d+)\b"
)


def _normalize_html_pr_url(owner: str, repo: str, num: int) -> str:

    return f"https://github.com/{owner}/{repo}/pull/{int(num)}"


def _collect_from_text(text: str, default_owner: str, default_repo: str) -> set:
    """Extract candidate PR URLs from free text (body/comments/events text).

    Returns a set of normalized github.com HTML PR URLs. Bare "#N" shorthand
    is resolved against (default_owner, default_repo).
    """
    if not text:
        return set()

    urls = set()

    # HTML URLs, API URLs, and explicit owner/repo#N shorthand all carry
    # their own owner/repo coordinates in named groups.
    for pattern in (PR_HTML_URL_RE, PR_API_URL_RE, SHORTHAND_CROSS_REPO_RE):
        urls.update(
            _normalize_html_pr_url(m.group("owner"), m.group("repo"), m.group("num"))
            for m in pattern.finditer(text)
        )

    # Same-repo shorthand "#N" falls back to the issue's own repository.
    urls.update(
        _normalize_html_pr_url(default_owner, default_repo, m.group("num"))
        for m in SHORTHAND_SAME_REPO_RE.finditer(text)
    )

    return urls


def _paginate_gh_get(url, headers=None, per_page=100):
    """Yield every item from a paginated GitHub list endpoint.

    Pages are fetched in order until an empty page, a non-list payload, or a
    short page (fewer than `per_page` items) is seen.

    NOTE(review): the `headers` argument is accepted but never forwarded —
    gh_get is called without it and uses the module defaults; confirm whether
    any caller relies on passing headers here.
    """
    page = 1
    while True:
        batch = gh_get(url, params={"per_page": per_page, "page": page})
        if not isinstance(batch, list) or not batch:
            return
        yield from batch
        if len(batch) < per_page:
            return
        page += 1


def _cross_ref_pr_url(event):
    """Return the PR html_url carried by a 'cross-referenced' event/timeline
    item, or None if the referencing item is not a pull request.

    The referencing item lives at event["source"]["issue"]; a '/pull/' path
    segment in its html_url identifies it as a PR. (The original code checked
    the 'pull_request' marker key first, but both its branches added the same
    URL, so the two checks collapse to the URL test.)
    """
    src = event.get("source") or {}
    issue_obj = src.get("issue") or {}
    html_url = issue_obj.get("html_url")
    if html_url and "/pull/" in html_url:
        return html_url
    return None


def extract_pr_urls_from_issue(issue_number: int):
    """
    Extract PR URLs associated with an issue by scanning:
      - Issue body
      - Issue comments
      - Issue events (including 'mentioned', 'cross-referenced', etc.)
      - Issue timeline (most reliable for cross references)

    Returns a sorted list of unique, normalized HTML PR URLs.

    Requires module globals:
      - REPO = "owner/repo"
      - gh_get / _paginate_gh_get request helpers

    Fixes vs. the original: removed the dead `base_headers` local (it was
    built and never used — the module-level HEADERS already carries the
    timeline preview Accept value) and deduplicated the cross-reference
    extraction shared by the events and timeline scans.
    """
    owner, repo = REPO.split("/", 1)
    pr_urls = set()

    # 1) Issue body
    issue = gh_get(f"https://api.github.com/repos/{REPO}/issues/{issue_number}")
    if isinstance(issue, dict):
        pr_urls |= _collect_from_text(issue.get("body") or "", owner, repo)
        # If called with a PR number, the item is itself a PR; nothing is
        # added for it unless the body text happens to reference it.

    # 2) All comments
    comments_url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments"
    for c in _paginate_gh_get(comments_url):
        pr_urls |= _collect_from_text(c.get("body") or "", owner, repo)

    # 3) Issue events ('mentioned', 'cross-referenced', etc.)
    events_url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/events"
    for ev in _paginate_gh_get(events_url):
        if not isinstance(ev, dict):
            continue
        # (a) Free-text fields carried by some events ('mentioned' events
        #     are covered by this text scan as well).
        pr_urls |= _collect_from_text(ev.get("body") or "", owner, repo)
        # (b) Structured cross-reference payloads.
        if ev.get("event") == "cross-referenced":
            url = _cross_ref_pr_url(ev)
            if url:
                pr_urls.add(url)

    # 4) Timeline API (the most complete for references)
    timeline_url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/timeline"
    for item in _paginate_gh_get(timeline_url):
        if not isinstance(item, dict):
            continue

        # Free-text scan on any plausible string field.
        for key in ("body", "message", "title", "commit_message", "subject"):
            val = item.get(key)
            if isinstance(val, str):
                pr_urls |= _collect_from_text(val, owner, repo)

        # Structured cross-reference payloads.
        if item.get("event") == "cross-referenced":
            url = _cross_ref_pr_url(item)
            if url:
                pr_urls.add(url)

        # Some timeline items are themselves PRs with an html_url.
        html_url = item.get("html_url")
        if isinstance(html_url, str) and "/pull/" in html_url:
            pr_urls.add(html_url)

        # Occasionally the timeline includes API-style URLs.
        api_url = item.get("url")
        if isinstance(api_url, str):
            m = PR_API_URL_RE.search(api_url)
            if m:
                pr_urls.add(_normalize_html_pr_url(m.group("owner"), m.group("repo"), m.group("num")))

    # Final normalization: keep only the exact HTML-PR-URL match (this also
    # trims any trailing path/query noise picked up from raw html_url fields).
    pr_urls = {m.group(0) for url in pr_urls for m in [PR_HTML_URL_RE.search(url)] if m}
    return sorted(pr_urls)


def pr_number_from_url(u):
    """Return the pull-request number embedded in a '/pull/<n>' URL, or None."""
    match = re.search(r"/pull/(\d+)", u)
    if match is None:
        return None
    return int(match.group(1))

#codingexercise: CodingExercise-03-17-2026.pdf

No comments:

Post a Comment