diff --git a/scripts/release_notes/classify_prs.py b/scripts/release_notes/classify_prs.py
new file mode 100644
index 00000000000..580e93bfe8b
--- /dev/null
+++ b/scripts/release_notes/classify_prs.py
@@ -0,0 +1,121 @@
+# In[1]:
+
+
+import pandas as pd
+
+
+# In[2]:
+
+
+df = pd.read_json("10.0_to_11.0-rc2.json").T
+df.tail()
+
+
+# In[3]:
+
+
+all_labels = set(lbl for labels in df["labels"] for lbl in labels)
+all_labels
+
+
+# In[4]:
+
+
+# Add one boolean column per label
+for label in all_labels:
+    df[label] = df["labels"].apply(lambda labels_list: label in labels_list)
+df.head()
+
+
+# In[5]:
+
+
+# Add a clean "module" column. It contains tuples since PRs can have more than one module.
+# Maybe we should include "topics" in that column as well?
+
+all_modules = {  # mapping: full name -> clean name
+    label: "".join(label.split(" ")[1:]) for label in all_labels if label.startswith("module")
+}
+
+# We use an ugly loop, but whatever ¯\_(ツ)_/¯
+df["module"] = [[] for _ in range(len(df))]
+for i, row in df.iterrows():
+    for full_name, clean_name in all_modules.items():
+        if full_name in row["labels"]:
+            row["module"].append(clean_name)
+df["module"] = df.module.apply(tuple)
+df.head()
+
+
+# In[6]:
+
+
+mod_df = df.set_index("module").sort_index()
+mod_df.tail()
+
+
+# In[7]:
+
+
+# All improvement PRs
+mod_df[mod_df["enhancement"]].head()
+
+
+# In[8]:
+
+
+# Improvement PRs for a given module
+# Note: don't filter on the module name via the index, as the index contains tuples with non-exclusive values.
+# Use the boolean column instead.
+mod_df[mod_df["enhancement"] & mod_df["module: transforms"]]
+
+
+# In[9]:
+
+
+def format_prs(mod_df):
+    out = []
+    for idx, row in mod_df.iterrows():
+        modules = idx
+        # Move "documentation" and "tests" to the end so the sorting is decent
+        for last_module in ("documentation", "tests"):
+            if last_module in modules:
+                modules = [m for m in modules if m != last_module] + [last_module]
+
+        module = f"[{', '.join(modules)}]"
+        module = module.replace("referencescripts", "reference scripts")
+        module = module.replace("code", "reference scripts")
+        out.append(f"{module} {row['title']}")
+
+    return "\n".join(out)
+
+
+# In[10]:
+
+
+included_prs = pd.DataFrame()
+
+# If labels are accurate, this should generate most of the release notes already.
+# We keep track of the included PRs to figure out which ones are missing.
+for section_title, module_idx in (
+    ("Backward-incompatible changes", "bc-breaking"),
+    ("Deprecations", "deprecation"),
+    ("New Features", "new feature"),
+    ("Improvements", "enhancement"),
+    ("Bug Fixes", "bug"),
+    ("Code Quality", "code quality"),
+):
+    print(f"## {section_title}")
+    print()
+    tmp_df = mod_df[mod_df[module_idx]]
+    included_prs = pd.concat([included_prs, tmp_df])
+    print(format_prs(tmp_df))
+    print()
+
+
+# In[11]:
+
+
+# Missing PRs are these ones... classify them manually
+missing_prs = pd.concat([mod_df, included_prs]).drop_duplicates(subset="pr_number", keep=False)
+print(format_prs(missing_prs))
diff --git a/scripts/release_notes/retrieve_prs_data.py b/scripts/release_notes/retrieve_prs_data.py
new file mode 100644
index 00000000000..90cb4cda07e
--- /dev/null
+++ b/scripts/release_notes/retrieve_prs_data.py
@@ -0,0 +1,212 @@
+import json
+import locale
+import os
+import re
+import subprocess
+from collections import namedtuple
+from os.path import expanduser
+
+import requests
+
+
+Features = namedtuple(
+    "Features",
+    [
+        "title",
+        "body",
+        "pr_number",
+        "files_changed",
+        "labels",
+    ],
+)
+
+
+def dict_to_features(dct):
+    return Features(
+        title=dct["title"],
+        body=dct["body"],
+        pr_number=dct["pr_number"],
+        files_changed=dct["files_changed"],
+        labels=dct["labels"],
+    )
+
+
+def features_to_dict(features):
+    return dict(features._asdict())
+
+
+def run(command):
+    """Run a shell command and return (return-code, stdout, stderr)."""
+    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+    output, err = p.communicate()
+    rc = p.returncode
+    enc = locale.getpreferredencoding()
+    output = output.decode(enc)
+    err = err.decode(enc)
+    return rc, output.strip(), err.strip()
+
+
+def commit_body(commit_hash):
+    cmd = f"git log -n 1 --pretty=format:%b {commit_hash}"
+    ret, out, err = run(cmd)
+    return out if ret == 0 else None
+
+
+def commit_title(commit_hash):
+    cmd = f"git log -n 1 --pretty=format:%s {commit_hash}"
+    ret, out, err = run(cmd)
+    return out if ret == 0 else None
+
+
+def commit_files_changed(commit_hash):
+    cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}"
+    ret, out, err = run(cmd)
+    return out.split("\n") if ret == 0 else None
+
+
+def parse_pr_number(body, commit_hash, title):
+    regex = r"(#[0-9]+)"
+    matches = re.findall(regex, title)
+    if len(matches) == 0:
+        if "revert" not in title.lower() and "updating submodules" not in title.lower():
+            print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR")
+        return None
+    if len(matches) > 1:
+        print(f"[{commit_hash}: {title}] Got multiple PR numbers, using the last one")
+        return matches[-1][1:]
+    return matches[0][1:]
+
+
+def get_ghstack_token():
+    pattern = "github_oauth = (.*)"
+    with open(expanduser("~/.ghstackrc"), "r+") as f:
+        config = f.read()
+    matches = re.findall(pattern, config)
+    if len(matches) == 0:
+        raise RuntimeError("Can't find a github oauth token")
+    return matches[0]
+
+
+token = get_ghstack_token()
+headers = {"Authorization": f"token {token}"}
+
+
+def run_query(query):
+    request = requests.post("https://api.github.com/graphql", json={"query": query}, headers=headers)
+    if request.status_code == 200:
+        return request.json()
+    else:
+        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))
+
+
+def gh_labels(pr_number):
+    query = f"""
+    {{
+        repository(owner: "pytorch", name: "vision") {{
+            pullRequest(number: {pr_number}) {{
+                labels(first: 10) {{
+                    edges {{
+                        node {{
+                            name
+                        }}
+                    }}
+                }}
+            }}
+        }}
+    }}
+    """
+    query = run_query(query)
+    edges = query["data"]["repository"]["pullRequest"]["labels"]["edges"]
+    return [edge["node"]["name"] for edge in edges]
+
+
+def get_features(commit_hash, return_dict=False):
+    title, body, files_changed = (
+        commit_title(commit_hash),
+        commit_body(commit_hash),
+        commit_files_changed(commit_hash),
+    )
+    pr_number = parse_pr_number(body, commit_hash, title)
+    labels = []
+    if pr_number is not None:
+        labels = gh_labels(pr_number)
+    result = Features(title, body, pr_number, files_changed, labels)
+    if return_dict:
+        return features_to_dict(result)
+    return result
+
+
+class CommitDataCache:
+    def __init__(self, path="results/data.json"):
+        self.path = path
+        self.data = {}
+        if os.path.exists(path):
+            self.data = self.read_from_disk()
+
+    def get(self, commit):
+        if commit not in self.data.keys():
+            # Fetch and cache the data
+            self.data[commit] = get_features(commit)
+            self.write_to_disk()
+        return self.data[commit]
+
+    def read_from_disk(self):
+        with open(self.path, "r") as f:
+            data = json.load(f)
+            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
+        return data
+
+    def write_to_disk(self):
+        data = {commit: features._asdict() for commit, features in self.data.items()}
+        with open(self.path, "w") as f:
+            json.dump(data, f)
+
+
+def get_commits_between(base_version, new_version):
+    cmd = f"git merge-base {base_version} {new_version}"
+    rc, merge_base, _ = run(cmd)
+    assert rc == 0
+
+    # Returns a list of lines like
+    # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
+    cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
+    rc, commits, _ = run(cmd)
+    assert rc == 0
+
+    log_lines = commits.split("\n")
+    hashes, titles = zip(*[log_line.split(" ", 1) for log_line in log_lines])
+    return hashes, titles
+
+
+def convert_to_dataframes(feature_list):
+    import pandas as pd
+
+    df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
+    return df
+
+
+def main(base_version, new_version):
+    hashes, titles = get_commits_between(base_version, new_version)
+
+    cdc = CommitDataCache("data.json")
+    for idx, commit in enumerate(hashes):
+        if idx % 10 == 0:
+            print(f"{idx} / {len(hashes)}")
+        cdc.get(commit)
+
+    return cdc
+
+
+if __name__ == "__main__":
+    # d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
+    # print(d)
+    # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
+
+    # Usage: change the tags below according to the current release, then save the json with
+    # cdc.write_to_disk().
+    # Then you can use classify_prs.py (as a notebook)
+    # to open the json and generate the release notes semi-automatically.
+    cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
+    from IPython import embed

+    embed()