- """
- batch_selfinstruct_generate.py
-
- run:
- python -m generate_instruction generate_instruction_following_data \
- --output_dir ./ \
- --num_instructions_to_generate 10 \
- --model_name="text-davinci-003" \
- """
import json
import os
import random
import re
import string
import time
from functools import partial
from multiprocessing import Pool

import fire
import numpy as np
import tqdm
from rouge_score import rouge_scorer

import utils


def encode_prompt(prompt_instructions):
    """Encode multiple prompt instructions into a single string."""
    with open("./prompt.txt") as f:
        prompt = f.read() + "\n"

    for idx, task_dict in enumerate(prompt_instructions):
        (instruction, input_text, output) = task_dict["instruction"], task_dict["input"], task_dict["output"]
        instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":")
        input_text = "<noinput>" if input_text == "" else input_text
        prompt += "###\n"
        prompt += f"{idx + 1}. Instruction: {instruction}\n"
        prompt += f"{idx + 1}. Input:\n{input_text}\n"
        prompt += f"{idx + 1}. Output:\n{output}\n"
    prompt += "###\n"
    prompt += f"{idx + 2}. Instruction:"
    return prompt
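

# Both the encoder above and the parser below rely on this numbered layout.
# For illustration, a prompt built from one hypothetical seed task ends like
# (the preamble comes from prompt.txt):
#
#   ###
#   1. Instruction: Translate the given sentence into French.
#   1. Input:
#   Good morning.
#   1. Output:
#   Bonjour.
#   ###
#   2. Instruction:
#
# The model is expected to continue from "2. Instruction:" in the same format.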


def post_process_gpt3_response(num_prompt_instructions, response):
    """Parse numbered Instruction/Input/Output triples out of a completion and apply quality filters."""
    if response is None:
        return []
    raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
    raw_instructions = re.split("###", raw_instructions)
    instructions = []
    for idx, inst in enumerate(raw_instructions):
        # if decoding stopped due to length, the last example is likely truncated, so discard it
        if idx == len(raw_instructions) - 1 and response["finish_reason"] == "length":
            continue
        idx += num_prompt_instructions + 1
        splitted_data = re.split(rf"{idx}\.\s+(Instruction|Input|Output):", inst)
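        # For a well-formed block, the single capturing group makes re.split return
        # 7 pieces, e.g. (hypothetical): ["", "Instruction", " Sort the list.\n",
        # "Input", "\n3, 1, 2\n", "Output", "\n1, 2, 3\n"];
        # any other length signals a malformed generation.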
        if len(splitted_data) != 7:
            continue
        inst = splitted_data[2].strip()
        input_text = splitted_data[4].strip()
        input_text = "" if input_text.lower() == "<noinput>" else input_text
        output = splitted_data[6].strip()
        # filter out instructions that are too short or too long
        if len(inst.split()) <= 3 or len(inst.split()) > 150:
            continue
        # filter out instructions that reference modalities or actions a text-only
        # language model cannot handle
        blacklist = [
            "image",
            "images",
            "graph",
            "graphs",
            "picture",
            "pictures",
            "file",
            "files",
            "map",
            "maps",
            "draw",
            "plot",
            "go to",
            "video",
            "audio",
            "music",
            "flowchart",
            "diagram",
        ]
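        # e.g. a hypothetical candidate "Draw a map of Europe." is dropped by the
        # check below, since both "draw" and "map" appear in the blacklist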
        if any(find_word_in_string(word, inst) for word in blacklist):
            continue
        # The model tends to add "write a program" to some existing instructions,
        # which leads to many instructions of that form, and it is confusing
        # whether the model should write a program or directly output the result,
        # so we filter them out. Note this is not a comprehensive filter for all
        # programming instructions.
        if inst.startswith("Write a program"):
            continue
        # filter out instructions starting with punctuation
        if inst[0] in string.punctuation:
            continue
        # filter out instructions starting with a non-English character
        if not inst[0].isascii():
            continue
        instructions.append({"instruction": inst, "input": input_text, "output": output})
    return instructions


def find_word_in_string(w, s):
    """Return a match if `w` occurs in `s` as a whole word, case-insensitively."""
    return re.compile(r"\b({0})\b".format(w), flags=re.IGNORECASE).search(s)
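# For example (hypothetical strings): find_word_in_string("draw", "Draw a chart")
# matches despite the capitalization, while find_word_in_string("draw", "withdraw
# the funds") does not, because \b requires whole-word boundaries.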


def generate_instruction_following_data(
    output_dir="./",
    seed_tasks_path="./seed_tasks.jsonl",
    num_instructions_to_generate=100,
    model_name="text-davinci-003",
    num_prompt_instructions=3,
    request_batch_size=5,
    temperature=1.0,
    top_p=1.0,
    num_cpus=16,
):
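    # Each line of seed_tasks.jsonl is expected to be a JSON object shaped like
    # (hypothetical values for illustration):
    #   {"instruction": "Sort the list.", "instances": [{"input": "3, 1, 2", "output": "1, 2, 3"}]}
    # Only the first instance of each seed task is used below.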
    with open(seed_tasks_path) as f:
        seed_tasks = [json.loads(line) for line in f]
    seed_instruction_data = [
        {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
        for t in seed_tasks
    ]
- print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")
-
- os.makedirs(output_dir, exist_ok=True)
- request_idx = 0
- # load the LM-generated instructions
- machine_instruction_data = []
- if os.path.exists(os.path.join(output_dir, "regen.json")):
- machine_instruction_data = utils.jload(os.path.join(output_dir, "regen.json"))
- print(f"Loaded {len(machine_instruction_data)} machine-generated instructions")
-
- # similarities = {}
- scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
-
- # now let's generate new instructions!
- progress_bar = tqdm.tqdm(total=num_instructions_to_generate)
- if machine_instruction_data:
- progress_bar.update(len(machine_instruction_data))
-
- # first we tokenize all the seed instructions and generated machine instructions
- all_instructions = [d["instruction"] for d in seed_instruction_data] + [
- d["instruction"] for d in machine_instruction_data
- ]
- all_instruction_tokens = [scorer._tokenizer.tokenize(inst) for inst in all_instructions]
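    # Tokenizing everything once up front lets each new candidate be scored against
    # cached token lists instead of re-tokenizing the whole pool per candidate.
    # For a rough feel for the metric (hypothetical strings): "Sort a list of
    # numbers." vs. "Sort the given list of integers." share the longest common
    # subsequence [sort, list, of], giving ROUGE-L F1 = 2*(3/6)*(3/5)/(3/6 + 3/5)
    # ≈ 0.55, below the 0.7 near-duplicate cutoff used below.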

    while len(machine_instruction_data) < num_instructions_to_generate:
        request_idx += 1

        batch_inputs = []
        for _ in range(request_batch_size):
            # only sample from the seed tasks when building prompts
            prompt_instructions = random.sample(seed_instruction_data, num_prompt_instructions)
            prompt = encode_prompt(prompt_instructions)
            batch_inputs.append(prompt)
        decoding_args = utils.OpenAIDecodingArguments(
            temperature=temperature,
            n=1,
            max_tokens=3072,  # hard-coded to maximize the length; the requests will be automatically adjusted
            top_p=top_p,
            stop=["\n20", "20."],  # stop decoding once the numbered list reaches item 20
        )
        request_start = time.time()
        results = utils.openai_completion(
            prompts=batch_inputs,
            model_name=model_name,
            batch_size=request_batch_size,
            decoding_args=decoding_args,
            logit_bias={"50256": -100},  # prevent the <|endoftext|> token from being generated
        )
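        # each element of `results` is assumed to behave like a completion choice,
        # exposing at least result["text"] and result["finish_reason"], which is
        # what post_process_gpt3_response reads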
        request_duration = time.time() - request_start

        process_start = time.time()
        instruction_data = []
        for result in results:
            new_instructions = post_process_gpt3_response(num_prompt_instructions, result)
            instruction_data += new_instructions

        total = len(instruction_data)
        keep = 0
        # use one worker pool for the whole batch rather than opening one per candidate
        with Pool(num_cpus) as p:
            for instruction_data_entry in instruction_data:
                # compute similarity against the pre-tokenized instructions
                new_instruction_tokens = scorer._tokenizer.tokenize(instruction_data_entry["instruction"])
                rouge_scores = p.map(
                    partial(rouge_scorer._score_lcs, new_instruction_tokens),
                    all_instruction_tokens,
                )
                rouge_scores = [score.fmeasure for score in rouge_scores]
                most_similar_instructions = {
                    all_instructions[i]: rouge_scores[i] for i in np.argsort(rouge_scores)[-10:][::-1]
                }
                # drop near-duplicates: candidates whose best ROUGE-L F1 against the
                # existing pool exceeds 0.7
                if max(rouge_scores) > 0.7:
                    continue
                keep += 1
                instruction_data_entry["most_similar_instructions"] = most_similar_instructions
                instruction_data_entry["avg_similarity_score"] = float(np.mean(rouge_scores))
                machine_instruction_data.append(instruction_data_entry)
                all_instructions.append(instruction_data_entry["instruction"])
                all_instruction_tokens.append(new_instruction_tokens)
                progress_bar.update(1)
        process_duration = time.time() - process_start
        print(f"Request {request_idx} took {request_duration:.2f}s, processing took {process_duration:.2f}s")
        print(f"Generated {total} instructions, kept {keep} instructions")
        utils.jdump(machine_instruction_data, os.path.join(output_dir, "regen.json"))


def main(task, **kwargs):
    # dispatch to any top-level function by name, so the CLI reads:
    #   python generate_instruction.py <task> --flag=value ...
    globals()[task](**kwargs)


if __name__ == "__main__":
    fire.Fire(main)