Sweep: Rerank the search results using fuzzywuzzy. Use src/core/vector_db.py #151

wwzeng1 · 2023-06-29T07:46:48Z

No description provided.

sweep-nightly · 2023-06-29T07:48:29Z

I've started working on the PR for issue #151. The plan is to integrate the fuzzywuzzy library into our search functionality to improve the ranking of search results. This will involve modifying the src/core/vector_db.py file to include fuzzywuzzy and adjust the function that returns search results. I'll also add some tests to ensure the reranking is working as expected.

Give me a minute!

Some code snippets I looked at (click to expand). If some file is missing from here, you can mention the path in the ticket description.

sweep/src/core/sweep_bot.py

Lines 1 to 42 in f130c22

    
           import json 
        
           from loguru import logger 
        
           import github 
        
           from github.Repository import Repository 
        
           from github.ContentFile import ContentFile 
        
           from github.GithubException import GithubException 
        
           import modal 
        
           from pydantic import BaseModel 
        
           from src.core.code_repair import CodeRepairer 
        
           from src.core.entities import ( 
        
               FileChange, 
        
               FileChangeRequest, 
        
               FilesToChange, 
        
               PullRequest, 
        
               RegexMatchError, 
        
               Function, 
        
               Snippet 
        
           ) 
        
           from src.core.chat import ChatGPT 
        
           from src.core.prompts import ( 
        
               files_to_change_prompt, 
        
               pull_request_prompt, 
        
               create_file_prompt, 
        
               modify_file_prompt, 
        
               modify_file_plan_prompt, 
        
               cot_retrieval_prompt 
        
           ) 
        
           from src.utils.constants import DB_NAME 
        
           from src.utils.file_change_functions import modify_file_function, apply_code_edits 
        
           from src.utils.diff import format_contents, generate_diff, generate_new_file, is_markdown 
        
           class CodeGenBot(ChatGPT): 
        
               def get_files_to_change(self): 
        
                   file_change_requests: list[FileChangeRequest] = [] 
        
                   for count in range(5): 
        
                       try: 
        
                           logger.info(f"Generating for the {count}th time...") 
        
                           files_to_change_response = self.chat(files_to_change_prompt, message_key="files_to_change") # Dedup files to change here

sweep/src/slack.py

Lines 325 to 350 in f130c22

    
                   text=f":mag_right: Searching \"{arguments['query']}\" in the codebase...", 
        
                   thread_ts=thread_ts 
        
               ) 
        
               additional_snippets = sweep_bot.search_snippets( 
        
                   arguments["query"], 
        
                   installation_id=installation_id 
        
               ) 
        
               # additional_snippets = default_snippets 
        
               additional_snippets_message = f":mag_right: Found {len(additional_snippets)} additional snippets with the query \"{arguments['query']}\":\n\n" +  "\n".join( 
        
                   f"{snippet.get_slack_link(repo_name)}\n```{snippet.get_preview()}```" for snippet in additional_snippets 
        
               ) 
        
               client.chat_update( 
        
                   channel=request.channel_id, 
        
                   text=additional_snippets_message, 
        
                   ts=search_message["ts"], 
        
               ) 
        
               response = sweep_bot.chat(additional_snippets_message, functions=functions) 
        
           elif name == "create_pr": 
        
               title = arguments["title"] 
        
               summary = arguments["summary"] 
        
               branch = arguments["branch"] 
        
               plan = arguments["plan"] 
        
               plan_message = "\n".join(f"• `{file['file_path']}`: {file['instructions']}" for file in plan) 
        
               plan_message = ">" + plan_message.replace("\n", "\n> ") 
        
               creating_pr_message = client.chat_postMessage(

sweep/tests/test_pr_diffs.py

Lines 51 to 82 in f130c22

    
               snippets, tree = search_snippets( 
        
                               repo, 
        
                               f"{query}", 
        
                               num_files=10, 
        
                               installation_id=36855882, 
        
                           ) 
        
               snippets_text = format_snippets(snippets) 
        
               return snippets_text, tree   
        
           query_to_snippets_fn = stub.function(**FUNCTION_SETTINGS, retries=0)(query_to_snippets_text) 
        
           if __name__ == "__main__": 
        
               access_token = os.environ.get("ACCESS_TOKEN") 
        
               g = Github(access_token) 
        
               repo_name = "sweepai/bot-internal" 
        
               issue_url = "github.com/sweepai/bot-internal/issues/28" 
        
               username = "wwzeng1" 
        
               repo_description = "A repo for Sweep" 
        
               title = "Sweep: Use loguru.info to show the number of tokens in the anthropic call" 
        
               summary = "" 
        
               replies_text = "" 
        
               repo = g.get_repo(repo_name) 
        
               pr = repo.get_pull(339) 
        
               # Temp query 
        
               query = pr.title 
        
               logger.info("Getting PR diffs...") 
        
               diffs = get_pr_diffs(repo, pr) 
        
               with stub.run(): 
        
                   logger.info("Getting snippets...") 
        
                   snippets_text, tree = query_to_snippets_fn.call(query, repo) 
        
               human_message = HumanMessagePromptReview(

sweep/tests/test_review_comments.py

Lines 51 to 82 in f130c22

    
               snippets, tree = search_snippets( 
        
                               repo, 
        
                               f"{query}", 
        
                               num_files=10, 
        
                               installation_id=36855882, 
        
                           ) 
        
               snippets_text = format_snippets(snippets) 
        
               return snippets_text, tree   
        
           query_to_snippets_fn = stub.function(**FUNCTION_SETTINGS, retries=0)(query_to_snippets_text) 
        
           if __name__ == "__main__": 
        
               access_token = os.environ.get("ACCESS_TOKEN") 
        
               g = Github(access_token) 
        
               repo_name = "sweepai/bot-internal" 
        
               issue_url = "github.com/sweepai/bot-internal/issues/28" 
        
               username = "wwzeng1" 
        
               repo_description = "A repo for Sweep" 
        
               title = "Sweep: Use loguru.info to show the number of tokens in the anthropic call" 
        
               summary = "" 
        
               replies_text = "" 
        
               repo = g.get_repo(repo_name) 
        
               pr = repo.get_pull(339) 
        
               # Temp query 
        
               query = pr.title 
        
               logger.info("Getting PR diffs...") 
        
               diffs = get_pr_diffs(repo, pr) 
        
               with stub.run(): 
        
                   logger.info("Getting snippets...") 
        
                   snippets_text, tree = query_to_snippets_fn.call(query, repo) 
        
               human_message = HumanMessagePromptReview(

sweep/src/core/vector_db.py

Lines 1 to 306 in f130c22

    
           import json 
        
           import os 
        
           import re 
        
           import time 
        
           import shutil 
        
           import glob 
        
           from modal import stub 
        
           from loguru import logger 
        
           from redis import Redis 
        
           from tqdm import tqdm 
        
           import modal 
        
           from modal import method 
        
           from deeplake.core.vectorstore.deeplake_vectorstore import DeepLakeVectorStore 
        
           from github import Github 
        
           from git import Repo 
        
           from src.core.entities import Snippet 
        
           from src.utils.event_logger import posthog 
        
           from src.utils.hash import hash_sha256 
        
           from ..utils.github_utils import get_token 
        
           from ..utils.constants import DB_NAME, BOT_TOKEN_NAME, ENV, UTILS_NAME 
        
           from ..utils.config import SweepConfig 
        
           import time 
        
           # TODO: Lots of cleanups can be done here with these constants 
        
           stub = modal.Stub(DB_NAME) 
        
           chunker = modal.Function.lookup(UTILS_NAME, "Chunking.chunk") 
        
           model_volume = modal.SharedVolume().persist(f"{ENV}-storage") 
        
           MODEL_DIR = "/root/cache/model" 
        
           DEEPLAKE_DIR = "/root/cache/" 
        
           DISKCACHE_DIR = "/root/cache/diskcache/" 
        
           DEEPLAKE_FOLDER = "deeplake/" 
        
           BATCH_SIZE = 256 
        
           SENTENCE_TRANSFORMERS_MODEL = "sentence-transformers/all-MiniLM-L12-v2" 
        
           timeout = 60 * 30 # 30 minutes 
        
           image = ( 
        
               modal.Image.debian_slim() 
        
               .apt_install("git") 
        
               .pip_install("deeplake==3.6.3", "sentence-transformers") 
        
               .pip_install("openai", "PyGithub", "loguru", "docarray", "GitPython", "tqdm", "highlight-io", "anthropic", "posthog", "redis", "pyyaml") 
        
           ) 
        
           secrets = [ 
        
               modal.Secret.from_name(BOT_TOKEN_NAME), 
        
               modal.Secret.from_name("openai-secret"), 
        
               modal.Secret.from_name("huggingface"), 
        
               modal.Secret.from_name("chroma-endpoint"), 
        
               modal.Secret.from_name("posthog"), 
        
               modal.Secret.from_name("highlight"), 
        
               modal.Secret.from_name("redis_url"), 
        
               modal.Secret.from_dict({"TRANSFORMERS_CACHE": MODEL_DIR}), 
        
           ] 
        
           def init_deeplake_vs(repo_name): 
        
               deeplake_repo_path = f"mem://{DEEPLAKE_FOLDER}{repo_name}" 
        
               deeplake_vector_store = DeepLakeVectorStore(path = deeplake_repo_path) 
        
               return deeplake_vector_store 
        
           def parse_collection_name(name: str) -> str: 
        
               # Replace any non-alphanumeric characters with hyphens 
        
               name = re.sub(r"[^\w-]", "--", name) 
        
               # Ensure the name is between 3 and 63 characters and starts/ends with alphanumeric 
        
               name = re.sub(r"^(-*\w{0,61}\w)-*$", r"\1", name[:63].ljust(3, "x")) 
        
               return name 
        
           def list_collection_names(): 
        
               """Returns a list of all collection names.""" 
        
               collections = [] 
        
               return collections 
        
           @stub.cls( 
        
               image=image, 
        
               secrets=secrets, 
        
               shared_volumes={MODEL_DIR: model_volume}, 
        
               keep_warm=1, 
        
               gpu="T4", 
        
               retries=modal.Retries(max_retries=5, backoff_coefficient=2, initial_delay=5), 
        
           ) 
        
           class Embedding: 
        
               def __enter__(self): 
        
                   from sentence_transformers import SentenceTransformer 
        
                   self.model = SentenceTransformer( 
        
                       SENTENCE_TRANSFORMERS_MODEL, cache_folder=MODEL_DIR 
        
                   ) 
        
               @method() 
        
               def compute(self, texts: list[str]): 
        
                   return self.model.encode(texts, batch_size=BATCH_SIZE).tolist() 
        
               @method() 
        
               def ping(self): 
        
                   return "pong" 
        
           class ModalEmbeddingFunction(): 
        
               def __init__(self): 
        
                   pass 
        
               def __call__(self, texts): 
        
                   return Embedding.compute.call(texts) 
        
           embedding_function = ModalEmbeddingFunction() 
        
           def get_deeplake_vs_from_repo( 
        
               repo_name: str, 
        
               sweep_config: SweepConfig = SweepConfig(), 
        
               installation_id: int = None, 
        
               branch_name: str = None, 
        
           ): 
        
               logger.info(f"Downloading repository and indexing for {repo_name}...") 
        
               token = get_token(installation_id) 
        
               g = Github(token) 
        
               repo = g.get_repo(repo_name) 
        
               try: 
        
                   labels = repo.get_labels() 
        
                   label_names = [label.name for label in labels] 
        
                   if "sweep" not in label_names: 
        
                       repo.create_label( 
        
                           name="sweep", 
        
                           color="5319E7", 
        
                           description="Assigns Sweep to an issue or pull request.", 
        
                       ) 
        
               except Exception as e: 
        
                   logger.error(f"Received error {e}") 
        
                   logger.warning("Repository already exists, skipping initialization") 
        
               start = time.time() 
        
               logger.info("Recursively getting list of files...") 
        
               repo_url = f"https://x-access-token:{token}@github.com/{repo_name}.git" 
        
               shutil.rmtree("repo", ignore_errors=True) 
        
               Repo.clone_from(repo_url, "repo") 
        
               file_list = glob.iglob("repo/**", recursive=True) 
        
               file_list = [ 
        
                   file 
        
                   for file in tqdm(file_list) 
        
                   if os.path.isfile(file) 
        
                   and all(not file.endswith(ext) for ext in sweep_config.exclude_exts) 
        
                   and all(not file[len("repo/"):].startswith(dir_name) for dir_name in sweep_config.exclude_dirs) 
        
               ] 
        
               branch_name = repo.default_branch 
        
               file_paths = [] 
        
               file_contents = [] 
        
               for file in tqdm(file_list): 
        
                   with open(file, "rb") as f: 
        
                       is_binary = False 
        
                       for block in iter(lambda: f.read(1024), b''): 
        
                           if b'\0' in block: 
        
                               is_binary = True 
        
                               break 
        
                       if is_binary: 
        
                           logger.debug("Skipping binary file...") 
        
                           continue 
        
                   with open(file, "rb") as f: 
        
                       if len(f.read()) > sweep_config.max_file_limit: 
        
                           logger.debug("Skipping large file...") 
        
                           continue 
        
                   with open(file, "r") as f: 
        
                       # Can parallelize this 
        
                       try: 
        
                           contents = f.read() 
        
                           contents = f"Represent this code snippet from {file} for retrieval:\n" + contents 
        
                       except UnicodeDecodeError as e: 
        
                           logger.warning(f"Received warning {e}, skipping...") 
        
                           continue 
        
                       file_path = file[len("repo/") :] 
        
                       file_paths.append(file_path) 
        
                       file_contents.append(contents) 
        
               chunked_results = chunker.map(file_contents, file_paths, kwargs={ 
        
                   "additional_metadata": {"repo_name": repo_name, "branch_name": branch_name} 
        
               }) 
        
               documents, metadatas, ids = zip(*chunked_results) 
        
               documents = [item for sublist in documents for item in sublist] 
        
               metadatas = [item for sublist in metadatas for item in sublist] 
        
               ids = [item for sublist in ids for item in sublist] 
        
               logger.info(f"Used {len(file_paths)} files...") 
        
               shutil.rmtree("repo") 
        
               logger.info(f"Getting list of all files took {time.time() -start}") 
        
               logger.info(f"Received {len(documents)} documents from repository {repo_name}") 
        
               collection_name = parse_collection_name(repo_name) 
        
               deeplake_vs = init_deeplake_vs(collection_name) 
        
               if len(documents) > 0: 
        
                   logger.info("Computing embeddings...") 
        
                   # Check cache here for all documents 
        
                   embeddings = [None] * len(documents) 
        
                   cache_success = True 
        
                   try: 
        
                       cache = Redis.from_url(os.environ.get("redis_url")) 
        
                       logger.info(f"Succesfully got cache for {collection_name}") 
        
                   except: 
        
                       cache_success = False 
        
                   if cache_success: 
        
                       cache_keys = [hash_sha256(doc) + SENTENCE_TRANSFORMERS_MODEL for doc in documents] 
        
                       cache_values = cache.mget(cache_keys) 
        
                       for idx, value in enumerate(cache_values): 
        
                           if value is not None: 
        
                               embeddings[idx] = json.loads(value) 
        
                   logger.info(f"Found {len([x for x in embeddings if x is not None])} embeddings in cache") 
        
                   indices_to_compute = [idx for idx, x in enumerate(embeddings) if x is None] 
        
                   documents_to_compute = [documents[idx] for idx in indices_to_compute] 
        
                   computed_embeddings = embedding_function(documents_to_compute) 
        
                   for idx, embedding in zip(indices_to_compute, computed_embeddings): 
        
                       embeddings[idx] = embedding 
        
                   deeplake_vs.add( 
        
                       text = ids, 
        
                       embedding = embeddings, 
        
                       metadata = metadatas 
        
                   ) 
        
                   if cache_success and len(documents_to_compute) > 0: 
        
                       logger.info(f"Updating cache with {len(computed_embeddings)} embeddings") 
        
                       cache_keys = [hash_sha256(doc) + SENTENCE_TRANSFORMERS_MODEL for doc in documents_to_compute] 
        
                       cache.mset({key: json.dumps(value) for key, value in zip(cache_keys, computed_embeddings)}) 
        
                   return deeplake_vs 
        
               else: 
        
                   logger.error("No documents found in repository") 
        
                   return deeplake_vs 
        
           @stub.function(image=image, secrets=secrets, shared_volumes={DISKCACHE_DIR: model_volume}, timeout=timeout) 
        
           def init_index( 
        
               repo_name: str, 
        
               installation_id: int, 
        
               sweep_config: SweepConfig = SweepConfig(), 
        
           ): 
        
               pass 
        
           @stub.function(image=image, secrets=secrets, shared_volumes={DISKCACHE_DIR: model_volume}, timeout=timeout) 
        
           def update_index( 
        
               repo_name, 
        
               installation_id: int, 
        
               sweep_config: SweepConfig = SweepConfig(), 
        
           ) -> int: 
        
               pass 
        
           @stub.function(image=image, secrets=secrets, shared_volumes={DEEPLAKE_DIR: model_volume}, timeout=timeout) 
        
           def get_relevant_snippets( 
        
               repo_name: str, 
        
               query: str, 
        
               n_results: int, 
        
               installation_id: int, 
        
               username: str = None, 
        
               sweep_config: SweepConfig = SweepConfig(), 
        
           ): 
        
               collection_names = list_collection_names() 
        
               logger.info("DeepLake collections: {}".format(collection_names)) 
        
               collection_name = parse_collection_name(repo_name) 
        
               if collection_name not in collection_names: 
        
                   init_index( 
        
                       repo_name=repo_name, 
        
                       installation_id=installation_id, 
        
                       sweep_config=sweep_config, 
        
                   ) 
        
               deeplake_vs = get_deeplake_vs_from_repo( 
        
                   repo_name=repo_name, installation_id=installation_id, sweep_config=sweep_config 
        
               ) 
        
               results = {"metadata": [], "text": []} 
        
               for n_result in range(n_results, 0, -1): 
        
                   try: 
        
                       query = "Represent this natural language query for code retrieval:\n" + query 
        
                       query_embedding = embedding_function([query])[0] 
        
                       results = deeplake_vs.search(embedding=query_embedding, k=n_result) 
        
                       break 
        
                   except Exception: 
        
                       pass 
        
               if len(results["text"]) == 0: 
        
                   if username is None: 
        
                       username = "anonymous" 
        
                   posthog.capture( 
        
                       username, 
        
                       "failed", 
        
                       { 
        
                           "reason": "Results query was empty", 
        
                           "repo_name": repo_name, 
        
                           "installation_id": installation_id,  
        
                           "query": query,  
        
                           "n_results": n_results 
        
                       }, 
        
                   ) 
        
               metadatas = results["metadata"] 
        
               relevant_paths = [metadata["file_path"] for metadata in metadatas] 
        
               logger.info("Relevant paths: {}".format(relevant_paths)) 
        
               return [ 
        
                   Snippet( 
        
                       content="", 
        
                       start=metadata["start"],  
        
                       end=metadata["end"],  
        
                       file_path=file_path 
        
                   ) for metadata, file_path in zip(metadatas, relevant_paths) 
        
               ]

I'm a bot that handles simple bugs and feature requests but I might make mistakes. Please be kind!

sweep-nightly · 2023-06-29T08:11:28Z

I'm sorry, but it looks like an error has occured. Try removing and re-adding the sweep label. If this error persists contact team@sweep.dev.

I'm a bot that handles simple bugs and feature requests but I might make mistakes. Please be kind!

sweep-nightly bot added the sweep Assigns Sweep to an issue or pull request. label Jun 29, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Sweep: Rerank the search results using fuzzywuzzy. Use src/core/vector_db.py #151

Sweep: Rerank the search results using fuzzywuzzy. Use src/core/vector_db.py #151

wwzeng1 commented Jun 29, 2023

sweep-nightly bot commented Jun 29, 2023

sweep-nightly bot commented Jun 29, 2023

Sweep: Rerank the search results using fuzzywuzzy. Use src/core/vector_db.py #151

Sweep: Rerank the search results using fuzzywuzzy. Use src/core/vector_db.py #151

Comments

wwzeng1 commented Jun 29, 2023

sweep-nightly bot commented Jun 29, 2023

sweep-nightly bot commented Jun 29, 2023