fix: add predict.py & analyze.py #737

Draft · wants to merge 3 commits into base: main
74 changes: 74 additions & 0 deletions experimental/eval/analyze.py
@@ -0,0 +1,74 @@
import json
import sys
from eval_utils import postprocess_code_lines, remove_comments
from tree_sitter import Language, Parser

def analyze(model, language, file):
    # Pre-built tree-sitter grammar for the target language.
    lang_path = f"build/{language}-lang-parser.so"

    line_match = 0
    statement_match = 0

    parser = Parser()
    if language == "csharp":
        parser_language = Language(lang_path, "c_sharp")
    else:
        parser_language = Language(lang_path, language)
    parser.set_language(parser_language)

    input_file = f"./data/{model}/{language}/{file}"
    output_file = f"./data/{model}/{language}/result_{file}"

    with open(output_file, 'w') as fout:
        with open(input_file) as fin:
            for line in fin:
                obj = json.loads(line)
                result = {}
                prediction = ""

                # Copy fields preceding the prediction/error marker into the output
                # record; capture the prediction text if present.
                for k in obj.keys():
                    if k == "prediction":
                        prediction = str(obj[k])
                        break
                    elif k == "error":
                        break
                    else:
                        result[k] = obj[k]

                tabby_eval = {}
                if file == "line_completion.jsonl":
                    tabby_eval["raw_prompt"] = obj["prompt"]
                else:
                    tabby_eval["raw_prompt"] = obj["crossfile_context"]["text"] + obj["prompt"]

                tabby_eval["prediction"] = prediction

                groundtruth = obj["groundtruth"]

                # Exact match on the first generated line.
                tabby_eval["first_line_prediction"] = prediction.split("\n")[0]
                tabby_eval["first_line_groundtruth"] = groundtruth.split("\n")[0]
                if tabby_eval["first_line_prediction"] == tabby_eval["first_line_groundtruth"]:
                    tabby_eval["first_line_matched"] = True
                    line_match += 1
                else:
                    tabby_eval["first_line_matched"] = False

                # Exact match on the first statement, as extracted by the tree-sitter parser.
                tabby_eval["first_statement_prediction"] = postprocess_code_lines(tabby_eval["raw_prompt"], prediction, parser, language)
                tabby_eval["first_statement_groundtruth"] = postprocess_code_lines(tabby_eval["raw_prompt"], groundtruth, parser, language)
                if tabby_eval["first_statement_prediction"] == tabby_eval["first_statement_groundtruth"]:
                    tabby_eval["first_statement_matched"] = True
                    statement_match += 1
                else:
                    tabby_eval["first_statement_matched"] = False

                result["tabby_eval"] = tabby_eval

                json.dump(result, fout)
                fout.write("\n")

    print(f"first line matched: {line_match}")
    print(f"first statement matched: {statement_match}")


analyze(sys.argv[1], sys.argv[2], sys.argv[3])
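
For context, a minimal usage sketch, not part of this PR: the script takes the model name, language, and benchmark file as positional arguments, reads ./data/<model>/<language>/<file> produced by predict.py, and writes result_<file> next to it. The model, language, and file values below are hypothetical examples.

# Hypothetical invocation; assumes ./data/StarCoder-1B/python/line_completion.jsonl exists
# and a tree-sitter grammar has been built at build/python-lang-parser.so:
#
#   python analyze.py StarCoder-1B python line_completion.jsonl
#
# which is equivalent to calling:
analyze("StarCoder-1B", "python", "line_completion.jsonl")
# Writes ./data/StarCoder-1B/python/result_line_completion.jsonl and prints the
# first-line and first-statement exact-match counts.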

197 changes: 197 additions & 0 deletions experimental/eval/predict.py
@@ -0,0 +1,197 @@
from pathlib import Path

import modal
from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method
import os


import asyncio

GPU_CONFIG = gpu.A10G()
#MODEL_ID = os.environ.get("MODEL_ID", "")
Review comment: Remove commented out code?

#MODEL_ID = "TabbyML/StarCoder-7B"
MODEL_ID = os.popen("cat /tmp/tabby_model_id").read().strip()
LAUNCH_FLAGS = ["serve", "--model", MODEL_ID, "--port", "8000", "--device", "cuda"]
#print(f'MODEL_ID = `{MODEL_ID}`')

def download_model():
    import subprocess
    import os

    MODEL_ID = os.popen("cat /tmp/tabby_model_id").read().strip()
    print(f'MODEL_ID={MODEL_ID}')
Review comment: Remote model ID

    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            MODEL_ID,
        ]
    )


image = (
    Image.from_registry(
        "tabbyml/tabby:0.5.5",
        add_python="3.11",
    )
    .dockerfile_commands("ENTRYPOINT []")
    .pip_install(
        "git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=experimental/eval/tabby-python-client",
        "pandas",
    )
    .copy_local_file(local_path="/tmp/tabby_model_id", remote_path="/tmp/tabby_model_id")
    .run_function(download_model, force_build=True)
)

stub = Stub("tabby-" + MODEL_ID.split("/")[-1], image=image)


@stub.cls(
    gpu=GPU_CONFIG,
    concurrency_limit=10,
    allow_concurrent_inputs=4,
    container_idle_timeout=60 * 10,
    timeout=360,
)
class Model:
    def __enter__(self):
        import socket
        import subprocess, os
        import time

        from tabby_python_client import Client

        my_env = os.environ.copy()
        my_env["TABBY_DISABLE_USAGE_COLLECTION"] = "1"
        MODEL_ID = os.popen("cat /tmp/tabby_model_id").read().strip()
        print(f'MODEL_ID={MODEL_ID}')
Review comment: Local model ID

        LAUNCH_FLAGS = ["serve", "--model", MODEL_ID, "--port", "8000", "--device", "cuda"]
        self.launcher = subprocess.Popen(["/opt/tabby/bin/tabby"] + LAUNCH_FLAGS, env=my_env)
        self.client = Client("http://127.0.0.1:8000", timeout=60)

        # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
        def webserver_ready():
            try:
                socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
                return True
            except (socket.timeout, ConnectionRefusedError):
                # Check if launcher webserving process has exited.
                # If so, a connection can never be made.
                retcode = self.launcher.poll()
                if retcode is not None:
                    raise RuntimeError(
                        f"launcher exited unexpectedly with code {retcode}"
                    )
                return False

        while not webserver_ready():
            time.sleep(1.0)

        print("Tabby server ready!")

    def __exit__(self, _exc_type, _exc_value, _traceback):
        self.launcher.terminate()

    @method()
    async def health(self):
        from tabby_python_client.api.v1 import health

        resp = await health.asyncio(client=self.client)
        return resp.to_dict()

    @method()
    async def complete(self, language, crossfile_context, index, row):
        from tabby_python_client.api.v1 import completion
        from tabby_python_client.models import (
            CompletionRequest,
            DebugOptions,
            CompletionResponse,
            Segments,
        )
        from tabby_python_client.types import Response
        from tabby_python_client import errors
        import pandas as pd

        # Skip rows that already carry a prediction.
        if 'prediction' in row and not pd.isnull(row['prediction']):
            return None, None, None

        if crossfile_context:
            prompt = row["crossfile_context"]["text"] + row["prompt"]
        else:
            prompt = row["prompt"]

        groundtruth = row["groundtruth"]

        request = CompletionRequest(
            language=language, debug_options=DebugOptions(raw_prompt=prompt)
        )
        # resp: CompletionResponse = await completion.asyncio(
        #     client=self.client, json_body=request
        # )
        try:
            resp: Response = await completion.asyncio_detailed(
                client=self.client, json_body=request
            )

            if resp.parsed != None:
                return index, resp.parsed.choices[0].text, None
            else:
                return index, None, f"<{resp.status_code}>"
        except errors.UnexpectedStatus as e:
            return index, None, f"error: code={e.status_code} content={e.content} error={e}"
        except Exception as e:
            return index, None, f"error type: {type(e)}"



@stub.local_entrypoint()
async def main(language, file):
    import json
    import pandas as pd

    print(MODEL_ID)

    model = Model()
    print("model info:")
    health_resp = model.health.remote()
    print(health_resp)
    assert health_resp['model'] == MODEL_ID

    whole_path_file = "./data/" + MODEL_ID.split("/")[-1] + "/" + language + "/" + file

    if file == 'line_completion.jsonl':
        crossfile_context = False
    else:
        crossfile_context = True

    objs = []
    with open(whole_path_file) as fin:
        for line in fin:
            obj = json.loads(line)
            objs.append(obj)

    df = pd.DataFrame(objs)

    outputs = await asyncio.gather(
        *[model.complete.remote.aio(language, crossfile_context, index, row) for index, row in df.iterrows()]
    )

    skipped = 0
    success = 0
    error = 0

    for index, prediction, error_msg in outputs:
        if index is None:
            skipped += 1
        elif prediction is not None:
            df.loc[index, 'prediction'] = prediction
            success += 1
        else:
            df.loc[index, 'error'] = error_msg
            error += 1

    print(f"Skipped {skipped} rows, {success} rows with predictions, {error} rows with errors")

    with open(whole_path_file, 'w') as fout:
        for index, row in df.iterrows():
            json.dump(row.to_dict(), fout)
            fout.write('\n')
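
For context, a minimal driver sketch, not part of this PR: predict.py reads the model id from /tmp/tabby_model_id (at image build and at container start), and its local entrypoint takes the language and the benchmark file name. The model id below is only an example, and the assumption that modal run exposes local_entrypoint parameters as --language/--file flags follows Modal's documented CLI behaviour.

# Hypothetical driver (assumes the Modal CLI is installed and configured, and that
# ./data/StarCoder-1B/python/line_completion.jsonl already exists).
import subprocess
from pathlib import Path

MODEL_ID = "TabbyML/StarCoder-1B"  # example model id, not fixed by this PR
Path("/tmp/tabby_model_id").write_text(MODEL_ID)

# predict.py fills a prediction (or error) field into each row of the input file
# in place; analyze.py can then be run on the same file.
subprocess.run(
    ["modal", "run", "predict.py", "--language", "python", "--file", "line_completion.jsonl"],
    check=True,
)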