Add scripts for facilitating generating release notes #3973


Merged: 6 commits, Oct 15, 2021
121 changes: 121 additions & 0 deletions scripts/release_notes/classify_prs.py
@@ -0,0 +1,121 @@
# In[1]:
Member:

I added the script with these cell markers, @datumbox. Some IDEs like VS Code and maybe PyCharm understand these as a notebook, so the .py file can be used directly as a notebook within the IDE.
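A minimal sketch of the cell-marker convention the comment refers to (assumed behavior, nothing torchvision-specific: the `# In[n]:` markers are what Jupyter emits when exporting a notebook to a .py script, and VS Code's Jupyter integration also recognizes `# %%` as a cell boundary):

# In[1]:
total = sum(range(10))  # executed as the first notebook cell

# In[2]:
print(total)  # executed as the second cell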

import pandas as pd


# In[2]:


df = pd.read_json("10.0_to_11.0-rc2.json").T
df.tail()


# In[3]:


all_labels = set(lbl for labels in df["labels"] for lbl in labels)
all_labels


# In[4]:


# Add one column per label
for label in all_labels:
    df[label] = df["labels"].apply(lambda labels_list: label in labels_list)
df.head()


# In[5]:


# Add a clean "module" column. It contains tuples since PRs can have more than one module.
# Maybe we should include "topics" in that column as well?

all_modules = {  # mapping: full name -> clean name
    label: "".join(label.split(" ")[1:]) for label in all_labels if label.startswith("module")
}
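# For example "module: transforms" becomes "transforms". Multi-word labels lose their
# inner spaces ("module: reference scripts" -> "referencescripts"); format_prs() below
# patches that back.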

# We use an ugly loop, but whatever ¯\_(ツ)_/¯
df["module"] = [[] for _ in range(len(df))]
for i, row in df.iterrows():
    for full_name, clean_name in all_modules.items():
        if full_name in row["labels"]:
            row["module"].append(clean_name)
df["module"] = df.module.apply(tuple)
df.head()


# In[6]:


mod_df = df.set_index("module").sort_index()
mod_df.tail()


# In[7]:


# All improvement PRs
mod_df[mod_df["enhancement"]].head()


# In[8]:


# Improvements for a specific module.
# Note: don't filter on the module name through the index, as the index contains tuples with non-exclusive values.
# Use the boolean column instead.
mod_df[mod_df["enhancement"] & mod_df["module: transforms"]]


# In[9]:


def format_prs(mod_df):
    out = []
    for idx, row in mod_df.iterrows():
        modules = idx
        # Move "documentation" and "tests" to the end so that the sorting is decent
        for last_module in ("documentation", "tests"):
            if last_module in modules:
                modules = [m for m in modules if m != last_module] + [last_module]

        module = f"[{', '.join(modules)}]"
        module = module.replace("referencescripts", "reference scripts")
        module = module.replace("code", "reference scripts")
        out.append(f"{module} {row['title']}")

    return "\n".join(out)
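# Example output line (hypothetical PR title):
# [transforms, documentation] Clarify the interpolation docs of Resize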


# In[10]:


included_prs = pd.DataFrame()

# If the labels are accurate, this should generate most of the release notes already.
# We keep track of the included PRs to figure out which ones are missing.
for section_title, module_idx in (
    ("Backward-incompatible changes", "bc-breaking"),
    ("Deprecations", "deprecation"),
    ("New Features", "new feature"),
    ("Improvements", "enhancement"),
    ("Bug Fixes", "bug"),
    ("Code Quality", "code quality"),
):
    print(f"## {section_title}")
    print()
    tmp_df = mod_df[mod_df[module_idx]]
    included_prs = pd.concat([included_prs, tmp_df])
    print(format_prs(tmp_df))
    print()


# In[11]:


# The PRs that didn't make it into any section above are missing; classify them manually.
# Concatenating with the included PRs and dropping every duplicated pr_number acts as a
# set difference: only the never-included PRs survive.
missing_prs = pd.concat([mod_df, included_prs]).drop_duplicates(subset="pr_number", keep=False)
print(format_prs(missing_prs))
212 changes: 212 additions & 0 deletions scripts/release_notes/retrieve_prs_data.py
@@ -0,0 +1,212 @@
import json
import locale
import os
import re
import subprocess
from collections import namedtuple
from os.path import expanduser

import requests


Features = namedtuple(
    "Features",
    [
        "title",
        "body",
        "pr_number",
        "files_changed",
        "labels",
    ],
)


def dict_to_features(dct):
    return Features(
        title=dct["title"],
        body=dct["body"],
        pr_number=dct["pr_number"],
        files_changed=dct["files_changed"],
        labels=dct["labels"],
    )


def features_to_dict(features):
    return dict(features._asdict())


def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()
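# Usage sketch:
# rc, out, err = run("git rev-parse HEAD")  # rc == 0 on success, out is the stripped stdout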


def commit_body(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%b {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%s {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}"
    ret, out, err = run(cmd)
    return out.split("\n") if ret == 0 else None


def parse_pr_number(body, commit_hash, title):
    regex = r"(#[0-9]+)"
    matches = re.findall(regex, title)
    if len(matches) == 0:
        if "revert" not in title.lower() and "updating submodules" not in title.lower():
            print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR")
        return None
    if len(matches) > 1:
        print(f"[{commit_hash}: {title}] Got two PR numbers, using the last one")
        return matches[-1][1:]
    return matches[0][1:]
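# Example (hypothetical title; the body argument is unused):
# parse_pr_number("", "abc1234", "Fix flaky test (#4321)") -> "4321"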


def get_ghstack_token():
    pattern = "github_oauth = (.*)"
    with open(expanduser("~/.ghstackrc"), "r+") as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]
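# The token comes from ghstack's config file. Illustrative ~/.ghstackrc contents
# (format assumed from the regex above):
# [ghstack]
# github_oauth = <your-oauth-token>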


token = get_ghstack_token()
headers = {"Authorization": f"token {token}"}


def run_query(query):
    request = requests.post("https://api.github.com/graphql", json={"query": query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))


def gh_labels(pr_number):
    query = f"""
    {{
      repository(owner: "pytorch", name: "vision") {{
        pullRequest(number: {pr_number}) {{
          labels(first: 10) {{
            edges {{
              node {{
                name
              }}
            }}
          }}
        }}
      }}
    }}
    """
    query = run_query(query)
    edges = query["data"]["repository"]["pullRequest"]["labels"]["edges"]
    return [edge["node"]["name"] for edge in edges]
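# Note: labels(first: 10) only fetches a PR's first 10 labels; the script assumes that is enough.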


def get_features(commit_hash, return_dict=False):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash),
    )
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    if pr_number is not None:
        labels = gh_labels(pr_number)
    result = Features(title, body, pr_number, files_changed, labels)
    if return_dict:
        return features_to_dict(result)
    return result


class CommitDataCache:
    def __init__(self, path="results/data.json"):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()

    def get(self, commit):
        if commit not in self.data.keys():
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path, "r") as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, "w") as f:
            json.dump(data, f)
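# Usage sketch (hypothetical hash):
# cdc = CommitDataCache("data.json")
# features = cdc.get("b33e38ec47")  # hits git/GitHub once, then reads from the JSON cache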


def get_commits_between(base_version, new_version):
    cmd = f"git merge-base {base_version} {new_version}"
    rc, merge_base, _ = run(cmd)
    assert rc == 0

    # Returns a list of something like
    # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
    cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
    rc, commits, _ = run(cmd)
    assert rc == 0

    log_lines = commits.split("\n")
    hashes, titles = zip(*[log_line.split(" ", 1) for log_line in log_lines])
    return hashes, titles


def convert_to_dataframes(feature_list):
    import pandas as pd

    df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
    return df


def main(base_version, new_version):
    hashes, titles = get_commits_between(base_version, new_version)

    cdc = CommitDataCache("data.json")
    for idx, commit in enumerate(hashes):
        if idx % 10 == 0:
            print(f"{idx} / {len(hashes)}")
        cdc.get(commit)

    return cdc


if __name__ == "__main__":
    # d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
    # print(d)
    # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")

    # Usage: change the tags below according to the current release, then save the json with
    # cdc.write_to_disk().
    # Then you can use classify_prs.py (as a notebook) to open the json and generate the
    # release notes semi-automatically.
    cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
    from IPython import embed

    embed()