API Reference

Top Level API Reference

`add_canvas_users()`

Add users from canvas to the course

Source code in ai_ta_backend/main.py

@app.route('/addCanvasUsers', methods=['GET'])
def add_canvas_users():
  """
  Add users from canvas to the course

  Query parameters (both required):
    course_id: forwarded to `CanvasAPI.add_users` as its first argument.
    course_name: forwarded to `CanvasAPI.add_users` as its second argument.

  Returns:
    JSON `{"outcome": <return value of CanvasAPI.add_users>}` with a
    permissive `Access-Control-Allow-Origin` header.
    Aborts with HTTP 400 when either parameter is missing or empty.
  """
  print("In /addCanvasUsers")

  # Default to '' (instead of None) so a missing parameter is caught below
  # rather than being handed to the Canvas client as None.
  canvas_course_id: str = request.args.get('course_id', default='', type=str)
  course_name: str = request.args.get('course_name', default='', type=str)

  if canvas_course_id == '' or course_name == '':
    # proper web error "400 Bad request"
    abort(
        400,
        description=
        f"Missing one or more required parameters: 'course_id' and 'course_name' must be provided. course_id: `{canvas_course_id}`, course_name: `{course_name}`"
    )

  # Only build the Canvas client once the request is known to be well-formed.
  canvas = CanvasAPI()
  success_or_failure = canvas.add_users(canvas_course_id, course_name)

  response = jsonify({"outcome": success_or_failure})

  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

`delete()`

Delete a single file from all of our databases: S3, Qdrant, and Supabase (for now). Note that parts of the file will still remain in our logs.

Source code in ai_ta_backend/main.py

@app.route('/delete', methods=['DELETE'])
def delete():
  """
  Delete a single file from all our databases: S3, Qdrant, and Supabase (for now).
  Note, of course, we still have parts of that file in our logs.

  Query parameters:
    course_name (str): required.
    s3_path (str): required unless `url` is given.
    url (str): required unless `s3_path` is given.

  Returns:
    JSON `{"outcome": "success"}` immediately. The actual deletion is only
    *submitted* to the background executor here, so the response does not
    reflect whether the deletion eventually succeeded.
    Aborts with HTTP 400 when `course_name` is empty, or when both `s3_path`
    and `url` are empty.
  """
  course_name: str = request.args.get('course_name', default='', type=str)
  s3_path: str = request.args.get('s3_path', default='', type=str)
  source_url: str = request.args.get('url', default='', type=str)

  if course_name == '' or (s3_path == '' and source_url == ''):
    # proper web error "400 Bad request"
    # The query parameter is called `url`; name it that way so callers can fix their request.
    abort(
        400,
        description=
        f"Missing one or more required parameters: 'course_name' and ('s3_path' or 'url') must be provided. Course name: `{course_name}`, S3 path: `{s3_path}`, url: `{source_url}`"
    )

  start_time = time.monotonic()
  ingester = Ingest()
  # background execution of tasks!!
  executor.submit(ingester.delete_data, course_name, s3_path, source_url)
  # The work has only been queued at this point, so don't claim the file is already gone.
  print(f"From {course_name}, submitted background delete for s3_path: `{s3_path}`, url: `{source_url}`")
  # This measures client construction + task submission only, not the background deletion itself.
  print(f"⏰ Runtime of FULL delete func: {(time.monotonic() - start_time):.2f} seconds")
  # Drops only this local name; the submitted bound method still references the instance.
  del ingester

  # We need an instant return. Deletes are "best effort": assume always successful... sigh :(
  response = jsonify({"outcome": 'success'})
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

`getAll()`

Get all course materials based on the course_name

Source code in ai_ta_backend/main.py

@app.route('/getAll', methods=['GET'])
def getAll() -> Response:
  """Return the course materials that `Ingest.getAll` reports for one course.

  Query parameters:
    course_name (str): required.

  Returns:
    JSON `{"distinct_files": <return value of Ingest.getAll>}` with a
    permissive `Access-Control-Allow-Origin` header.
    Aborts with HTTP 400 when `course_name` is missing or empty.
  """
  requested_course: str = request.args.get('course_name', default='', type=str)

  # Guard clause: an empty string is the "not provided" sentinel.
  if not requested_course:
    # proper web error "400 Bad request"
    abort(
        400,
        description=
        f"Missing the one required parameter: 'course_name' must be provided. Course name: `{requested_course}`")

  database = Ingest()
  materials = database.getAll(requested_course)
  del database

  reply = jsonify({"distinct_files": materials})
  reply.headers.add('Access-Control-Allow-Origin', '*')
  return reply

`getContextStuffedPrompt()`

Get a stuffed prompt for a given user question and course name. Args : search_query (str) course_name (str) : used for metadata filtering Returns : str a very long "stuffed prompt" with question + summaries of 20 most relevant documents.

Source code in ai_ta_backend/main.py

@app.route('/getContextStuffedPrompt', methods=['GET'])
def getContextStuffedPrompt() -> Response:
  """
  Get a stuffed prompt for a given user question and course name.

  Query parameters (all required):
    search_query (str)
    course_name (str) : used for metadata filtering
    top_n (int)
    top_k_to_search (int)

  Note: -1 is the "not provided" sentinel for both integer parameters, so an
  explicit -1 is rejected as missing.

  Returns:
    JSON `{"prompt": <return value of Ingest.get_context_stuffed_prompt>}`
    with a permissive `Access-Control-Allow-Origin` header.
    Aborts with HTTP 400 when any required parameter is missing.
  """
  print("In /getContextStuffedPrompt")

  search_query: str = request.args.get('search_query', default='', type=str)
  course_name: str = request.args.get('course_name', default='', type=str)
  top_n: int = request.args.get('top_n', default=-1, type=int)
  top_k_to_search: int = request.args.get('top_k_to_search', default=-1, type=int)

  if search_query == '' or course_name == '' or top_n == -1 or top_k_to_search == -1:
    # proper web error "400 Bad request"
    abort(
        400,
        description=
        f"Missing one or more required parameters: 'search_query', 'course_name', 'top_n', and 'top_k_to_search' must be provided. Search query: `{search_query}`, Course name: `{course_name}`, Top N: `{top_n}`, Top K to search: `{top_k_to_search}`"
    )

  # Build the (client-heavy) Ingest object only after validation, so rejected
  # requests don't pay for S3/Qdrant/Supabase/LLM client construction.
  ingester = Ingest()
  start_time = time.monotonic()
  stuffed_prompt = ingester.get_context_stuffed_prompt(search_query, course_name, top_n, top_k_to_search)
  print(f"⏰ Runtime of EXTREME prompt stuffing: {(time.monotonic() - start_time):.2f} seconds")
  del ingester

  response = jsonify({"prompt": stuffed_prompt})
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

`getTopContexts()`

Get most relevant contexts for a given search query.

Return value

GET arguments

`course_name` (str, required), `search_query` (str, required), `token_limit` (int, optional; defaults to 3000).

Returns

JSON A JSON response with TBD fields. Metadata fields: * pagenumber_or_timestamp * readable_filename * s3_pdf_path

Example: [ { 'readable_filename': 'Lumetta_notes', 'pagenumber_or_timestamp': 'pg. 19', 's3_pdf_path': '/courses//Lumetta_notes.pdf', 'text': 'In FSM, we do this...' }, ]

Raises

Exception Testing how exceptions are handled.

Source code in ai_ta_backend/main.py

@app.route('/getTopContexts', methods=['GET'])
def getTopContexts() -> Response:
  """Get most relevant contexts for a given search query.

  ## GET arguments
  search_query : str, required
  course_name : str, required
  token_limit : int, optional
      Defaults to 3000. Forwarded to `Ingest.getTopContexts`.

  Returns
  -------
  JSON
      The return value of `Ingest.getTopContexts`, serialized as JSON
      (exact fields are TBD).
  Metadata fields
  * pagenumber_or_timestamp
  * readable_filename
  * s3_pdf_path

  Example: 
  [
    {
      'readable_filename': 'Lumetta_notes', 
      'pagenumber_or_timestamp': 'pg. 19', 
      's3_pdf_path': '/courses/<course>/Lumetta_notes.pdf', 
      'text': 'In FSM, we do this...'
    }, 
  ]

  Raises
  ------
  HTTP 400
      When `search_query` or `course_name` is missing or empty.
  """
  print("In getTopContexts in Main()")
  search_query: str = request.args.get('search_query', default='', type=str)
  course_name: str = request.args.get('course_name', default='', type=str)
  token_limit: int = request.args.get('token_limit', default=3000, type=int)
  if search_query == '' or course_name == '':
    # proper web error "400 Bad request"
    abort(
        400,
        description=
        f"Missing one or more required parameters: 'search_query' and 'course_name' must be provided. Search query: `{search_query}`, Course name: `{course_name}`"
    )

  # Thread counts are logged before and after the retrieval to help track down dangling threads.
  print("NUM ACTIVE THREADS (top of getTopContexts):", threading.active_count())

  ingester = Ingest()
  found_documents = ingester.getTopContexts(search_query, course_name, token_limit)
  print("NUM ACTIVE THREADS (after instantiating Ingest() class in getTopContexts):", threading.active_count())
  del ingester

  response = jsonify(found_documents)
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

`getTopContextsWithMQR()`

Get relevant contexts for a given search query, using Multi-query retrieval + filtering method.

Source code in ai_ta_backend/main.py

@app.route('/getTopContextsWithMQR', methods=['GET'])
def getTopContextsWithMQR() -> Response:
  """
  Get relevant contexts for a given search query, using Multi-query retrieval + filtering method.

  Query parameters:
    search_query (str): required.
    course_name (str): required.
    token_limit (int): optional, defaults to 3000.

  Returns:
    The return value of `Ingest.getTopContextsWithMQR`, serialized as JSON,
    with a permissive `Access-Control-Allow-Origin` header.
    Aborts with HTTP 400 when `search_query` or `course_name` is missing or empty.
  """
  search_query: str = request.args.get('search_query', default='', type=str)
  course_name: str = request.args.get('course_name', default='', type=str)
  token_limit: int = request.args.get('token_limit', default=3000, type=int)
  if search_query == '' or course_name == '':
    # proper web error "400 Bad request"
    abort(
        400,
        description=
        f"Missing one or more required parameters: 'search_query' and 'course_name' must be provided. Search query: `{search_query}`, Course name: `{course_name}`"
    )

  posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com')
  try:
    posthog.capture('distinct_id_of_the_user',
                    event='filter_top_contexts_invoked',
                    properties={
                        'user_query': search_query,
                        'course_name': course_name,
                        'token_limit': token_limit,
                    })

    ingester = Ingest()
    found_documents = ingester.getTopContextsWithMQR(search_query, course_name, token_limit)
    del ingester
  finally:
    # Always shut the analytics client down, even when retrieval raises,
    # so a failed request cannot leave it open.
    posthog.shutdown()

  response = jsonify(found_documents)
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

`get_stuffed_prompt()`

Get most relevant contexts for a given search query.

GET arguments

`course_name` (str, required), `search_query` (str, required), `token_limit` (int, required).

Returns

String

Source code in ai_ta_backend/main.py

@app.route('/get_stuffed_prompt', methods=['GET'])
def get_stuffed_prompt() -> Response:
  """Build a stuffed prompt for a given search query and course.

  ## GET arguments (all required)
  course_name : str
  search_query : str
  token_limit : int
      -1 is the "not provided" sentinel, so an explicit -1 is rejected as missing.

  Returns
  -------
  JSON
      The return value of `Ingest.get_stuffed_prompt`, serialized as JSON.

  Aborts with HTTP 400 when any required parameter is missing.
  """
  course_name: str = request.args.get('course_name', default='', type=str)
  search_query: str = request.args.get('search_query', default='', type=str)
  token_limit: int = request.args.get('token_limit', default=-1, type=int)
  if course_name == '' or search_query == '' or token_limit == -1:
    # proper web error "400 Bad request"
    abort(
        400,
        description=
        f"Missing one or more required parameters: 'course_name', 'search_query', and 'token_limit' must be provided. Course name: `{course_name}`, Search query: `{search_query}`, Token limit: `{token_limit}`"
    )

  # The parameters above always carry a default and a type converter, so
  # neither can be None here and token_limit is already an int; the former
  # `is None` checks and int() cast were unreachable / no-ops and are gone.
  print("In /get_stuffed_prompt: ", search_query)

  ingester = Ingest()
  prompt = ingester.get_stuffed_prompt(search_query, course_name, token_limit)
  del ingester

  response = jsonify(prompt)
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

`index()`

summary

Parameters:

Name	Type	Description	Default
`test`	`int`	description. Defaults to 1.	required

Returns:

Name	Type	Description
`JSON`	`Response`	description

Source code in ai_ta_backend/main.py

@app.route('/')
def index() -> Response:
  """Root route: reply with a static JSON welcome message.

  Takes no parameters.

  Returns:
      Response: JSON object with a single "Choo Choo" key, sent with a
      permissive `Access-Control-Allow-Origin` header.
  """
  welcome_payload = {"Choo Choo": "Welcome to your Flask app 🚅"}
  reply = jsonify(welcome_payload)
  reply.headers.add('Access-Control-Allow-Origin', '*')
  return reply

`ingest()`

Recursively ingests anything from S3 filepath and below. Pass a s3_paths filepath (not URL) into our S3 bucket.

Ingests all files, not just PDFs.

Parameters:

Name	Type	Description	Default
`s3_paths`		str \| List[str]	required

Returns:

Name	Type	Description
`str`	`Response`	Success or Failure message. Failure message if any failures. TODO: email on failure.

Source code in ai_ta_backend/main.py

@app.route('/ingest', methods=['GET'])
def ingest() -> Response:
  """Recursively ingests anything from S3 filepath and below. 
  Pass a s3_paths filepath (not URL) into our S3 bucket.

  Ingests all files, not just PDFs. 

  Query parameters:
    s3_paths: required.
    course_name: required.
    readable_filename: optional; forwarded to `Ingest.bulk_ingest` only when non-empty.
    base_url: optional; forwarded as-is (None when absent).
    url: optional; forwarded as-is (None when absent).

  Returns:
      The dict returned by `Ingest.bulk_ingest`, serialized as JSON.
      TODO: email on failure.
      Aborts with HTTP 400 when `course_name` or `s3_paths` is missing or empty.
  """
  s3_paths: List[str] | str = request.args.get('s3_paths', default='')
  readable_filename: List[str] | str = request.args.get('readable_filename', default='')
  course_name: List[str] | str = request.args.get('course_name', default='')
  base_url: List[str] | str | None = request.args.get('base_url', default=None)
  url: List[str] | str | None = request.args.get('url', default=None)

  print(
      f"In top of /ingest route. course: {course_name}, s3paths: {s3_paths}, readable_filename: {readable_filename}, base_url: {base_url}, url: {url}"
  )

  if course_name == '' or s3_paths == '':
    # proper web error "400 Bad request"
    # The query parameter is `s3_paths` (plural); report it under its real name.
    abort(
        400,
        description=
        f"Missing one or more required parameters: 'course_name' and 's3_paths' must be provided. Course name: `{course_name}`, S3 paths: `{s3_paths}`"
    )

  print("NUM ACTIVE THREADS (top of /ingest):", threading.active_count())

  # `readable_filename` is passed only when the caller supplied one, so the
  # ingest methods can fall back to their own default filename otherwise.
  optional_kwargs = {'base_url': base_url, 'url': url}
  if readable_filename != '':
    optional_kwargs['readable_filename'] = readable_filename

  ingester = Ingest()
  success_fail_dict = ingester.bulk_ingest(s3_paths, course_name, **optional_kwargs)
  print(f"Bottom of /ingest route. success or fail dict: {success_fail_dict}")
  del ingester

  response = jsonify(success_fail_dict)
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

`ingest_canvas()`

Ingest course content from Canvas

Source code in ai_ta_backend/main.py

@app.route('/ingestCanvas', methods=['GET'])
def ingest_canvas():
  """
  Ingest course content from Canvas

  Query parameters:
    course_id: required; forwarded to `CanvasAPI.ingest_course_content`.
    course_name: required; forwarded to `CanvasAPI.ingest_course_content`.
    files, pages, modules, syllabus, assignments, discussions: optional
      flags. Each defaults to 'true'; any value other than 'true'
      (case-insensitive) disables that content type.

  Returns:
    JSON `{"outcome": <return value of CanvasAPI.ingest_course_content>}`
    with a permissive `Access-Control-Allow-Origin` header.
    Aborts with HTTP 400 when `course_id` or `course_name` is missing or empty.
  """
  print("made it to ingest")

  # Default to '' so that an absent parameter is caught by the check below;
  # without a default, `args.get` returns None, which never equals ''.
  canvas_course_id: str = request.args.get('course_id', default='', type=str)
  course_name: str = request.args.get('course_name', default='', type=str)

  if canvas_course_id == '' or course_name == '':
    # proper web error "400 Bad request"
    abort(
        400,
        description=
        f"Missing one or more required parameters: 'course_id' and 'course_name' must be provided. course_id: `{canvas_course_id}`, course_name: `{course_name}`"
    )

  # Retrieve the checkbox values from the request and create the content_ingest_dict
  # Set default values to True if not provided in the request
  content_ingest_dict = {
      'files': request.args.get('files', 'true').lower() == 'true',
      'pages': request.args.get('pages', 'true').lower() == 'true',
      'modules': request.args.get('modules', 'true').lower() == 'true',
      'syllabus': request.args.get('syllabus', 'true').lower() == 'true',
      'assignments': request.args.get('assignments', 'true').lower() == 'true',
      'discussions': request.args.get('discussions', 'true').lower() == 'true'
  }

  # Only build the Canvas client once the request is known to be well-formed.
  canvas = CanvasAPI()
  success_or_failure = canvas.ingest_course_content(canvas_course_id, course_name, content_ingest_dict)
  response = jsonify({"outcome": success_or_failure})
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

`ingest_web_text()`

Ingests web text data provided in the POST request body.

Expects JSON data containing

url: The URL of the web text to ingest.
base_url: The base URL of the web text to ingest.
title: The title of the web text.
content: The content of the web text.
course_name: The name of the course associated with the web text.

Returns:

Name	Type	Description
`str`	`Response`	Success or Failure message. Failure message if any failures. TODO: email on failure.

Source code in ai_ta_backend/main.py

@app.route('/ingest-web-text', methods=['POST'])
def ingest_web_text() -> Response:
  """Ingests web text data provided in the POST request body.

  Expects a JSON object containing:
    - url: The URL of the web text to ingest. Required.
    - base_url: The base URL of the web text to ingest.
    - title: The title of the web text. Required.
    - content: The content of the web text. When empty, nothing is ingested
      and a success outcome is returned.
    - courseName: The name of the course associated with the web text.
      Required. The snake_case key `course_name` is accepted as a fallback.

  Returns:
      The return value of `Ingest.ingest_single_web_text`, serialized as JSON
      (or `{"outcome": "success"}` when `content` is empty).
      TODO: email on failure.
      Aborts with HTTP 400 when the body is not a JSON object or when
      course name, url or title is missing.
  """
  data = request.get_json()
  if not isinstance(data, dict):
    # e.g. a body of `null` or a JSON array: `.get` below would otherwise raise and surface as a 500.
    abort(400, description="Request body must be a JSON object.")

  url: str = data.get('url', '')
  base_url: str = data.get('base_url', '')
  title: str = data.get('title', '')
  content: str = data.get('content', '')
  # `courseName` is the key this route has always read; `course_name` is
  # accepted too so callers following the documented name are not rejected.
  course_name: str = data.get('courseName', '') or data.get('course_name', '')

  print(f"In top of /ingest-web-text. course: {course_name}, base_url: {base_url}, url: {url}")

  if course_name == '' or url == '' or title == '':
    # proper web error "400 Bad request"
    abort(
        400,
        description=
        f"Missing one or more required parameters: course_name, url or title. Course name: `{course_name}`, url: `{url}`, content: `{content}`, title: `{title}`"
    )

  if content == '':
    # Nothing to ingest; report success rather than failing the caller.
    print(f"Content is empty. Skipping ingestion of {url}")
    response = jsonify({"outcome": "success"})
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response

  print("NUM ACTIVE THREADS (top of /ingest-web-text):", threading.active_count())

  ingester = Ingest()
  success_fail = ingester.ingest_single_web_text(course_name, base_url, url, content, title)
  del ingester

  print(f"Bottom of /ingest-web-text route. success or fail: {success_fail}")

  response = jsonify(success_fail)
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

`mit_download_course()`

Web scraper built for

Source code in ai_ta_backend/main.py

@app.route('/mit-download', methods=['GET'])
def mit_download_course() -> Response:
  """Run `mit_course_download` for the requested URL and return its result as JSON.

  NOTE(review): the original summary ("Web scraper built for ...") was cut
  off; the intended target site should be filled in by someone who knows it.

  Query parameters (all required):
    url (str)
    course_name (str)
    local_dir (str)

  Returns:
    The return value of `mit_course_download`, serialized as JSON, with a
    permissive `Access-Control-Allow-Origin` header.
    Aborts with HTTP 400 when any parameter is missing or empty.
  """
  query = request.args
  source_url: str = query.get('url', default='', type=str)
  target_course: str = query.get('course_name', default='', type=str)
  download_dir: str = query.get('local_dir', default='', type=str)

  # An empty string is the "not provided" sentinel for every parameter.
  if '' in (source_url, target_course, download_dir):
    # proper web error "400 Bad request"
    abort(
        400,
        description=
        f"Missing one or more required parameters: 'url', 'course_name', and 'local_dir' must be provided. url: `{source_url}`, course_name: `{target_course}`, local_dir: `{download_dir}`"
    )

  outcome = mit_course_download(source_url, target_course, download_dir)

  reply = jsonify(outcome)
  reply.headers.add('Access-Control-Allow-Origin', '*')
  return reply

`resource_report()`

Print server resources.

https://manpages.debian.org/bookworm/manpages-dev/getrlimit.2.en.html

Source code in ai_ta_backend/main.py

@app.route('/resource-report', methods=['GET'])
def resource_report() -> Response:
  """
  Print server resources.
  # https://manpages.debian.org/bookworm/manpages-dev/getrlimit.2.en.html

  Everything is written to stdout (the server log); the HTTP response itself
  only carries `{"outcome": "success"}`. Each probe that shells out or reads
  a file is wrapped in its own try/except so that one failing probe does not
  stop the rest of the report.
  """
  # Imported locally: `resource` is only available on Unix platforms.
  import resource
  from resource import getrusage, RUSAGE_SELF, RUSAGE_CHILDREN
  import subprocess

  print("👇👇👇👇👇👇👇👇👇 <RESOURCE REPORT> 👇👇👇👇👇👇👇👇👇")

  print("NUM ACTIVE THREADS (top of /resource-report):", threading.active_count())
  try:
    # result = subprocess.run(['ps', '-u', '$(whoami)', '|', 'wc', '-l'], stdout=subprocess.PIPE)
    # NOTE: this counts the lines `ps` prints for the current user, not threads as such.
    result = subprocess.run('ps -u $(whoami) | wc -l', shell=True, stdout=subprocess.PIPE)
    print("Current active threads: ", result.stdout.decode('utf-8'))
  except Exception as e:
    print("Error executing ps -u $(whoami) | wc -l: ", e)

  try:
    with open('/etc/security/limits.conf', 'r') as file:
      print("/etc/security/limits.conf:\n", file.read())
  except Exception as e:
    print("Error reading /etc/security/limits.conf: ", e)

  try:
    with open('/proc/sys/kernel/threads-max', 'r') as file:
      print("/proc/sys/kernel/threads-max: ", file.read())
  except Exception as e:
    print("Error reading /proc/sys/kernel/threads-max: ", e)

  # Check container or virtualization platform limits if applicable
  # This is highly dependent on the specific platform and setup
  # Here is an example for Docker, adjust as needed for your environment
  try:
    result = subprocess.run('docker stats --no-stream', shell=True, stdout=subprocess.PIPE)
    print("Docker stats:\n", result.stdout.decode('utf-8'))
  except Exception as e:
    print("Error getting Docker stats: ", e)

  # Each getrlimit() call returns a (soft, hard) pair; byte values are scaled for readability.
  print("RLIMIT_NPROC: ", resource.getrlimit(resource.RLIMIT_NPROC))
  print("RLIMIT_AS (GB): ", [limit / (1024 * 1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_AS)])
  print("RLIMIT_DATA (GB): ", [limit / (1024 * 1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_DATA)])
  print("RLIMIT_MEMLOCK (GB): ",
        [limit / (1024 * 1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_MEMLOCK)
        ])  # The maximum address space which may be locked in memory.
  print("RLIMIT_STACK (MB): ", [limit / (1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_STACK)])
  print("getpagesize (MB): ", resource.getpagesize() / (1024 * 1024))

  print("RUSAGE_SELF", getrusage(RUSAGE_SELF), end="\n")
  print("RUSAGE_CHILDREN", getrusage(RUSAGE_CHILDREN), end="\n")

  try:
    result = subprocess.run('ulimit -u', shell=True, stdout=subprocess.PIPE)
    print("ulimit -u: ", result.stdout.decode('utf-8'))
  except Exception as e:
    print("Error executing ulimit -u: ", e)

  try:
    result = subprocess.run('ulimit -a', shell=True, stdout=subprocess.PIPE)
    print(f"ulimit -a:\n{result.stdout.decode('utf-8')}")
  except Exception as e:
    print("Error executing ulimit -a: ", e)

  try:
    # RUSAGE_THREAD is a `who` selector for getrusage(), not a limit id for
    # getrlimit(); passing it to getrlimit() reported an unrelated limit.
    print("RUSAGE_THREAD: ", getrusage(resource.RUSAGE_THREAD))
  except Exception as e:
    # RUSAGE_THREAD is not defined on every platform; report instead of hiding the failure.
    print("Error in RUSAGE_THREAD: ", e)

  print("👆👆👆👆👆👆👆👆👆 </RESOURCE REPORT> 👆👆👆👆👆👆👆👆👆")

  response = jsonify({"outcome": "success"})
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

Backend endpoints

Database endpoints (Supabase, QDrant)

`Ingest`

Contains all methods for building and using vector databases.

Source code in ai_ta_backend/vector_database.py

class Ingest():
  """
  Contains all methods for building and using vector databases.
  """

  def __init__(self):
    """
    Create every external client this class uses, configured from environment variables:
    the OpenAI API key, a Qdrant client plus vector store, an S3 client,
    a Supabase client, an Azure chat LLM and a synchronous PostHog client.

    Settings read through `os.environ[...]` raise KeyError when unset;
    those read through `os.getenv(...)` are passed on as None when unset.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")

    # --- Vector DB: raw Qdrant client, then the vector store built on top of it
    qdrant = QdrantClient(url=os.getenv('QDRANT_URL'), api_key=os.getenv('QDRANT_API_KEY'))
    self.qdrant_client = qdrant
    self.vectorstore = Qdrant(
        client=qdrant,
        collection_name=os.environ['QDRANT_COLLECTION_NAME'],
        embeddings=OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE),
    )

    # --- S3
    self.s3_client = boto3.client('s3',
                                  aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                                  aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'))

    # --- Supabase
    self.supabase_client = supabase.create_client(  # type: ignore
        supabase_url=os.environ['SUPABASE_URL'],
        supabase_key=os.environ['SUPABASE_API_KEY'],
    )

    # --- Azure-hosted chat model; temperature is pinned to 0
    azure_settings = {
        'temperature': 0,
        'deployment_name': os.getenv('AZURE_OPENAI_ENGINE'),
        'openai_api_base': os.getenv('AZURE_OPENAI_ENDPOINT'),
        'openai_api_key': os.getenv('AZURE_OPENAI_KEY'),
        'openai_api_version': os.getenv('OPENAI_API_VERSION'),
        'openai_api_type': OPENAI_API_TYPE,
    }
    self.llm = AzureChatOpenAI(**azure_settings)  # type: ignore

    # --- Analytics, in sync mode
    self.posthog = Posthog(
        sync_mode=True,
        project_api_key=os.environ['POSTHOG_API_KEY'],
        host='https://app.posthog.com',
    )

  def __del__(self):
    """
    Best-effort cleanup: close the Qdrant client and drop the Supabase and S3
    client attributes. Every step is isolated so that a failure is only logged.
    """
    # PostHog used to be shut down here (it was a main cause of dangling
    # threads); since it now runs in sync mode, no shutdown is needed.
    try:
      self.qdrant_client.close()
    except Exception as qdrant_error:
      print("Failed to shutdown Qdrant. Probably fine. Error: ", qdrant_error)

    # Remaining clients are simply removed from the instance, one at a time.
    cleanup_steps = (
        ('supabase_client', "Failed delete supabase_client. Probably fine. Error: "),
        ('s3_client', "Failed to delete s3_client. Probably fine. Error: "),
    )
    for attribute, failure_message in cleanup_steps:
      try:
        delattr(self, attribute)
      except Exception as cleanup_error:
        print(failure_message, cleanup_error)

  def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwargs) -> Dict[str, List[str]]:
    """Ingest one or more S3 objects, picking an ingest method per file.

    For each path the method is chosen by file extension first, then by the
    top-level MIME category; if neither matches, the file is attempted as
    plain text as a last resort.

    Args:
      s3_paths: A single S3 key or a list of S3 keys.
      course_name: Forwarded to every ingest method.
      **kwargs: Forwarded unchanged to every ingest method.

    Returns:
      Dict with two lists: 'success_ingest' (S3 paths that returned "Success")
      and 'failure_ingest' (S3 paths, or error descriptions, that did not).
      An unexpected exception stops the loop; it is recorded in
      'failure_ingest', reported to Sentry, and the partial result is returned.
    """

    def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs):
      """Handle running an arbitrary ingest function for an individual file."""
      # RUN INGEST METHOD
      ret = ingest_method(s3_path, *args, **kwargs)
      if ret == "Success":
        success_status['success_ingest'].append(s3_path)
      else:
        success_status['failure_ingest'].append(s3_path)

    # 👇👇👇👇 ADD NEW INGEST METHODS HERE 👇👇👇👇🎉
    file_ingest_methods = {
        '.html': self._ingest_html,
        '.py': self._ingest_single_py,
        '.pdf': self._ingest_single_pdf,
        '.txt': self._ingest_single_txt,
        '.md': self._ingest_single_txt,
        '.srt': self._ingest_single_srt,
        '.vtt': self._ingest_single_vtt,
        '.docx': self._ingest_single_docx,
        '.ppt': self._ingest_single_ppt,
        '.pptx': self._ingest_single_ppt,
        '.xlsx': self._ingest_single_excel,
        '.xls': self._ingest_single_excel,
        '.csv': self._ingest_single_csv,
        '.png': self._ingest_single_image,
        '.jpg': self._ingest_single_image,
    }

    # Ingest methods via MIME type (more general than filetype)
    mimetype_ingest_methods = {
        'video': self._ingest_single_video,
        'audio': self._ingest_single_video,
        'text': self._ingest_single_txt,
        'image': self._ingest_single_image,
    }
    # 👆👆👆👆 ADD NEW INGEST METHODS HERE 👆👆👆👆🎉

    print(f"Top of ingest, Course_name {course_name}. S3 paths {s3_paths}")
    success_status = {"success_ingest": [], "failure_ingest": []}
    try:
      if isinstance(s3_paths, str):
        s3_paths = [s3_paths]

      for s3_path in s3_paths:
        file_extension = Path(s3_path).suffix
        with NamedTemporaryFile(suffix=file_extension) as tmpfile:
          self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile)
          # NOTE(review): guess_type() inspects only the file *name*, so this download
          # effectively serves as an existence check; kept to preserve error behavior.
          mime_type = mimetypes.guess_type(tmpfile.name, strict=False)[0]

        # guess_type() yields None for unknown extensions. Treat that as "no
        # category" instead of crashing on the split, which used to abort the
        # whole batch before the fallback below could run.
        mime_category = mime_type.split('/')[0] if mime_type else ''

        # Match extensions case-insensitively so e.g. '.PDF' uses the PDF ingester.
        extension_key = file_extension.lower()

        if extension_key in file_ingest_methods:
          # Use specialized functions when possible, fallback to mimetype. Else raise error.
          ingest_method = file_ingest_methods[extension_key]
          _ingest_single(ingest_method, s3_path, course_name, **kwargs)
        elif mime_category in mimetype_ingest_methods:
          # fallback to MimeType
          print("mime category", mime_category)
          ingest_method = mimetype_ingest_methods[mime_category]
          _ingest_single(ingest_method, s3_path, course_name, **kwargs)
        else:
          # No supported ingest... Fallback to attempting utf-8 decoding, otherwise fail.
          try:
            ret = self._ingest_single_txt(s3_path, course_name, **kwargs)
            # Ingest methods signal failure through their return value as well
            # as by raising, so only "Success" counts as success here too.
            if ret != "Success":
              raise ValueError(ret)
            success_status['success_ingest'].append(s3_path)
            print("✅ FALLBACK TO UTF-8 INGEST WAS SUCCESSFUL :) ")
          except Exception as e:
            print(
                f"We don't have a ingest method for this filetype: {file_extension}. As a last-ditch effort, we tried to ingest the file as utf-8 text, but that failed too. File is unsupported: {s3_path}. UTF-8 ingest error: {e}"
            )
            success_status['failure_ingest'].append(
                f"We don't have a ingest method for this filetype: {file_extension} (with generic type {mime_type}), for file: {s3_path}"
            )

      return success_status
    except Exception as e:
      success_status['failure_ingest'].append(f"MAJOR ERROR IN /bulk_ingest: Error: {str(e)}")
      sentry_sdk.capture_exception(e)
      return success_status

  def ingest_single_web_text(self, course_name: str, base_url: str, url: str, content: str, title: str):
    """Ingest one piece of already-extracted web text (Crawlee integration).

    Sends an "invoked" analytics event, uploads `content` through
    `split_and_upload` with web-page metadata (no S3 path), and sends a
    "succeeded" analytics event.

    Args:
      course_name: Course the text belongs to.
      base_url: Stored in the metadata as 'base_url'.
      url: Stored in the metadata as 'url'.
      content: The text to ingest.
      title: Stored in the metadata as 'readable_filename'.

    Returns:
      "Success", or a single error string (message plus traceback) when any
      step raises. The exception is also reported to Sentry.
    """
    self.posthog.capture('distinct_id_of_the_user',
                         event='ingest_single_web_text_invoked',
                         properties={
                             'course_name': course_name,
                             'base_url': base_url,
                             'url': url,
                             'content': content,
                             'title': title
                         })
    try:
      # if not, ingest the text
      text = [content]
      metadatas: List[Dict[str, Any]] = [{
          'course_name': course_name,
          's3_path': '',
          'readable_filename': title,
          'pagenumber': '',
          'timestamp': '',
          'url': url,
          'base_url': base_url,
      }]
      self.split_and_upload(texts=text, metadatas=metadatas)
      self.posthog.capture('distinct_id_of_the_user',
                           event='ingest_single_web_text_succeeded',
                           properties={
                               'course_name': course_name,
                               'base_url': base_url,
                               'url': url,
                               'title': title
                           })

      return "Success"
    except Exception as e:
      # Build ONE string. The previous `f"...", traceback.format_exc()` form
      # had a stray comma that turned `err` into a 2-tuple.
      err = f"❌❌ Error in (web text ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs):
    """
    Ingest a single Python source file from S3.

    The object is downloaded into the local `media/` folder, loaded with
    `PythonLoader`, and uploaded through `split_and_upload`.

    Args:
      s3_path: S3 key of the file.
      course_name: Course the file belongs to.
      **kwargs: `readable_filename` overrides the default display name, which
        is the file name with its first 37 characters removed.

    Returns:
      The return value of `split_and_upload`, or a single error string
      (message plus traceback) when any step raises. The exception is also
      reported to Sentry.
    """
    try:
      file_name = s3_path.split("/")[-1]
      file_path = "media/" + file_name  # download from s3 to local folder for ingest

      try:
        self.s3_client.download_file(os.getenv('S3_BUCKET_NAME'), s3_path, file_path)

        loader = PythonLoader(file_path)
        documents = loader.load()
      finally:
        # Remove the local copy even when the download or load fails, so
        # failed ingests don't leave files behind in media/.
        if os.path.exists(file_path):
          os.remove(file_path)

      texts = [doc.page_content for doc in documents]

      # One identical metadata dict per loaded document.
      metadatas: List[Dict[str, Any]] = [{
          'course_name': course_name,
          's3_path': s3_path,
          'readable_filename': kwargs.get('readable_filename',
                                          Path(s3_path).name[37:]),
          'pagenumber': '',
          'timestamp': '',
          'url': '',
          'base_url': '',
      } for _ in documents]

      success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas)
      print("Python ingest: ", success_or_failure)
      return success_or_failure

    except Exception as e:
      # Build ONE string. The previous `f"...", traceback.format_exc()` form
      # had a stray comma that turned `err` into a 2-tuple.
      err = f"❌❌ Error in (Python ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_single_vtt(self, s3_path: str, course_name: str, **kwargs):
    """
    Ingest a single .vtt file from S3.

    Args:
      s3_path: S3 key of the file.
      course_name: Course the file belongs to.
      **kwargs: `readable_filename` overrides the default display name, which
        is the file name with its first 37 characters removed.

    Returns:
      The return value of `split_and_upload`, or a single error string
      (message plus traceback) when any step raises. The exception is also
      reported to Sentry.
    """
    try:
      with NamedTemporaryFile() as tmpfile:
        # download from S3 into vtt_tmpfile
        self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile)
        # TextLoader re-opens the file by name, so push any buffered bytes to
        # disk first; otherwise it may see a truncated or empty file.
        tmpfile.flush()
        loader = TextLoader(tmpfile.name)
        documents = loader.load()
        texts = [doc.page_content for doc in documents]

        # One identical metadata dict per loaded document.
        metadatas: List[Dict[str, Any]] = [{
            'course_name': course_name,
            's3_path': s3_path,
            'readable_filename': kwargs.get('readable_filename',
                                            Path(s3_path).name[37:]),
            'pagenumber': '',
            'timestamp': '',
            'url': '',
            'base_url': '',
        } for _ in documents]

        success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas)
        return success_or_failure
    except Exception as e:
      # Build ONE string. The previous `f"...", traceback.format_exc()` form
      # had a stray comma that turned `err` into a 2-tuple.
      err = f"❌❌ Error in (VTT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
    """
    Ingest a single HTML file from S3.

    The object is read and decoded as UTF-8, its text is extracted with
    BeautifulSoup, and the result is uploaded through `split_and_upload`.
    The readable filename is derived from the S3 path.

    Args:
      s3_path: S3 key of the file.
      course_name: Course the file belongs to.
      **kwargs: `url` and `base_url` are copied into the metadata (default '').

    Returns:
      The return value of `split_and_upload`, or an error string when any
      step raises. The exception is also reported to Sentry.
    """
    print(f"IN _ingest_html s3_path `{s3_path}` kwargs: {kwargs}")
    try:
      s3_object = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path)
      page_source = s3_object['Body'].read().decode('utf-8')
      parsed_page = BeautifulSoup(page_source, 'html.parser')

      # Derive a human-readable name from the key: drop the course prefix and
      # the extension, turn separators into spaces, then trim.
      readable_name = s3_path.replace("courses/" + course_name, "")
      for old, new in ((".html", ""), ("_", " "), ("/", " ")):
        readable_name = readable_name.replace(old, new)
      readable_name = readable_name.strip()
      readable_name = readable_name[37:]  # removing the uuid prefix

      page_texts = [parsed_page.get_text()]
      page_metadata: List[Dict[str, Any]] = [{
          'course_name': course_name,
          's3_path': s3_path,
          'readable_filename': str(readable_name),  # adding str to avoid error: unhashable type 'slice'
          'url': kwargs.get('url', ''),
          'base_url': kwargs.get('base_url', ''),
          'pagenumber': '',
          'timestamp': '',
      }]

      outcome = self.split_and_upload(page_texts, page_metadata)
      print(f"_ingest_html: {outcome}")
      return outcome
    except Exception as exc:
      failure: str = f"ERROR IN _ingest_html: {exc}\nTraceback: {traceback.extract_tb(exc.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{exc}"  # type: ignore
      print(failure)
      sentry_sdk.capture_exception(exc)
      return failure

  def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str:
    """
    Ingest a single video or audio file from S3 by transcribing its audio.

    Steps:
      1. Download the object to a temp file and decode it with pydub.
      2. Re-export the audio as a `.webm` file under `media/`.
      3. If that file is larger than 26214400 bytes, cut the audio into
         consecutive 35-minute windows and transcribe each window; otherwise
         transcribe the whole file in one request.
      4. Send one text per transcript to `split_and_upload`. The `timestamp`
         metadata field holds the transcript's position in the list.

    Args:
        s3_path (str): Key of the media object in the `S3_BUCKET_NAME` bucket.
        course_name (str): Course name recorded in the metadata.
        **kwargs: Optional `readable_filename`. Defaults to the S3 file name with
            its first 37 characters dropped (presumably a uuid prefix).

    Returns:
        str: The result of `split_and_upload`, or an error string (exception
        message plus traceback) if anything raises.
    """
    print("Starting ingest video or audio")
    size_threshold_bytes = 26214400  # 25 MiB; larger exports are transcribed in pieces
    segment_ms = 35 * 60 * 1000  # pydub measures length and slices in milliseconds
    webm_path = None
    try:
      # check for file extension
      file_ext = Path(s3_path).suffix
      openai.api_key = os.getenv('OPENAI_API_KEY')
      transcript_list: List[str] = []

      with NamedTemporaryFile(suffix=file_ext) as video_tmpfile:
        # download from S3 into a video tmpfile
        self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=video_tmpfile)
        # pydub reopens the file by name; make sure buffered bytes are written out first.
        video_tmpfile.flush()
        # extract audio from video tmpfile
        mp4_version = AudioSegment.from_file(video_tmpfile.name, file_ext[1:])

      # save the extracted audio as a temporary webm file (removed in `finally`)
      with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as webm_tmpfile:
        webm_path = webm_tmpfile.name
        mp4_version.export(webm_tmpfile, format="webm")

      if os.path.getsize(webm_path) > size_threshold_bytes:
        full_audio = AudioSegment.from_file(webm_path, "webm")
        # Walk the audio in back-to-back windows so that no part is skipped and
        # no window is empty.
        for start in range(0, len(full_audio), segment_ms):
          audio_chunk = full_audio[start:start + segment_ms]
          with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as split_tmp:
            split_path = split_tmp.name
          try:
            audio_chunk.export(split_path, format="webm")
            with open(split_path, "rb") as f:
              transcript = openai.Audio.transcribe("whisper-1", f)
            transcript_list.append(transcript['text'])  # type: ignore
          finally:
            # always drop the chunk file, even when export/transcription fails
            os.remove(split_path)
      else:
        # transcribe the full audio
        with open(webm_path, "rb") as f:
          transcript = openai.Audio.transcribe("whisper-1", f)
        transcript_list.append(transcript['text'])  # type: ignore

      readable_filename = kwargs.get('readable_filename', Path(s3_path).name[37:])
      # enumerate() gives every transcript its own index, even when two
      # transcripts happen to contain identical text.
      metadatas: List[Dict[str, Any]] = [{
          'course_name': course_name,
          's3_path': s3_path,
          'readable_filename': readable_filename,
          'pagenumber': '',
          'timestamp': chunk_index,
          'url': '',
          'base_url': '',
      } for chunk_index, _ in enumerate(transcript_list)]

      # split_and_upload reports failures through its return value, so pass it on.
      return self.split_and_upload(texts=transcript_list, metadatas=metadatas)
    except Exception as e:
      # One f-string keeps `err` a str; a comma before the traceback would make it a tuple.
      err = f"❌❌ Error in (VIDEO ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err
    finally:
      # The webm export is created with delete=False, so clean it up on every path.
      if webm_path is not None and os.path.exists(webm_path):
        os.remove(webm_path)

  def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str:
    """
    Ingest a single .docx file from S3.

    Downloads the object to a temporary file, loads it with `Docx2txtLoader`,
    and passes the text plus one metadata dict per loaded document to
    `split_and_upload`.

    Args:
        s3_path (str): Key of the .docx object in the `S3_BUCKET_NAME` bucket.
        course_name (str): Course name recorded in the metadata.
        **kwargs: Optional `readable_filename`. Defaults to the S3 file name with
            its first 37 characters dropped (presumably a uuid prefix).

    Returns:
        str: The result of `split_and_upload`, or an error string (exception
        message plus traceback) if anything raises.
    """
    try:
      with NamedTemporaryFile() as tmpfile:
        self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile)
        # The loader reopens the file by name, so push any bytes still sitting in
        # Python's write buffer out to the file first.
        tmpfile.flush()

        documents = Docx2txtLoader(tmpfile.name).load()

        texts = [doc.page_content for doc in documents]
        metadatas: List[Dict[str, Any]] = [{
            'course_name': course_name,
            's3_path': s3_path,
            'readable_filename': kwargs.get('readable_filename',
                                            Path(s3_path).name[37:]),
            'pagenumber': '',
            'timestamp': '',
            'url': '',
            'base_url': '',
        } for _ in documents]

        # split_and_upload reports failures through its return value, so pass it on
        # instead of unconditionally claiming success.
        return self.split_and_upload(texts=texts, metadatas=metadatas)
    except Exception as e:
      # One f-string keeps `err` a str; a comma before the traceback would make it a tuple.
      err = f"❌❌ Error in (DOCX ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str:
    """
    Ingest a single .srt subtitle file from S3.

    Downloads the object to a temporary file, loads it with `SRTLoader`, and
    passes the text plus one metadata dict per loaded document to
    `split_and_upload`.

    Args:
        s3_path (str): Key of the .srt object in the `S3_BUCKET_NAME` bucket.
        course_name (str): Course name recorded in the metadata.
        **kwargs: Optional `readable_filename`. Defaults to the S3 file name with
            its first 37 characters dropped (presumably a uuid prefix).

    Returns:
        str: The result of `split_and_upload`, or an error string (exception
        message plus traceback) if anything raises.
    """
    try:
      with NamedTemporaryFile() as tmpfile:
        # download from S3 into tmpfile
        self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile)
        # The loader reopens the file by name, so push any bytes still sitting in
        # Python's write buffer out to the file first.
        tmpfile.flush()

        documents = SRTLoader(tmpfile.name).load()

        texts = [doc.page_content for doc in documents]
        metadatas: List[Dict[str, Any]] = [{
            'course_name': course_name,
            's3_path': s3_path,
            'readable_filename': kwargs.get('readable_filename',
                                            Path(s3_path).name[37:]),
            'pagenumber': '',
            'timestamp': '',
            'url': '',
            'base_url': '',
        } for _ in documents]

        # split_and_upload reports failures through its return value, so pass it on
        # instead of unconditionally claiming success.
        return self.split_and_upload(texts=texts, metadatas=metadatas)
    except Exception as e:
      # One f-string keeps `err` a str; a comma before the traceback would make it a tuple.
      err = f"❌❌ Error in (SRT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_single_excel(self, s3_path: str, course_name: str, **kwargs) -> str:
    """
    Ingest a single Excel workbook from S3.

    Downloads the object to a temporary file, loads it with
    `UnstructuredExcelLoader` in "elements" mode, and passes the text plus one
    metadata dict per loaded document to `split_and_upload`.

    Args:
        s3_path (str): Key of the workbook in the `S3_BUCKET_NAME` bucket.
        course_name (str): Course name recorded in the metadata.
        **kwargs: Optional `readable_filename`. Defaults to the S3 file name with
            its first 37 characters dropped (presumably a uuid prefix).

    Returns:
        str: The result of `split_and_upload`, or an error string (exception
        message plus traceback) if anything raises.
    """
    try:
      with NamedTemporaryFile() as tmpfile:
        # download from S3 into tmpfile
        self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile)
        # The loader reopens the file by name, so push any bytes still sitting in
        # Python's write buffer out to the file first.
        tmpfile.flush()

        documents = UnstructuredExcelLoader(tmpfile.name, mode="elements").load()

        texts = [doc.page_content for doc in documents]
        metadatas: List[Dict[str, Any]] = [{
            'course_name': course_name,
            's3_path': s3_path,
            'readable_filename': kwargs.get('readable_filename',
                                            Path(s3_path).name[37:]),
            'pagenumber': '',
            'timestamp': '',
            'url': '',
            'base_url': '',
        } for _ in documents]

        # split_and_upload reports failures through its return value, so pass it on
        # instead of unconditionally claiming success.
        return self.split_and_upload(texts=texts, metadatas=metadatas)
    except Exception as e:
      # One f-string keeps `err` a str; a comma before the traceback would make it a tuple.
      err = f"❌❌ Error in (Excel/xlsx ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_single_image(self, s3_path: str, course_name: str, **kwargs) -> str:
    """
    Ingest a single image from S3 by running it through Tesseract.

    Downloads the object to a temporary file, extracts text with
    `pytesseract.image_to_string`, and passes that text to `split_and_upload`
    as a single document.

    Args:
        s3_path (str): Key of the image object in the `S3_BUCKET_NAME` bucket.
        course_name (str): Course name recorded in the metadata.
        **kwargs: Optional `readable_filename`. Defaults to the S3 file name with
            its first 37 characters dropped (presumably a uuid prefix).

    Returns:
        str: The result of `split_and_upload`, or an error string (exception
        message plus traceback) if anything raises.
    """
    try:
      with NamedTemporaryFile() as tmpfile:
        # download from S3 into tmpfile
        self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile)
        # PIL reopens the file by name, so push any bytes still sitting in
        # Python's write buffer out to the file first.
        tmpfile.flush()

        # Unstructured image loader makes the install too large (700MB --> 6GB. 3min -> 12 min build times). AND nobody uses it.
        # The "hi_res" strategy will identify the layout of the document using detectron2. "ocr_only" uses pdfminer.six. https://unstructured-io.github.io/unstructured/core/partition.html#partition-image
        # Previous implementation, kept for reference:
        #   loader = UnstructuredImageLoader(tmpfile.name, unstructured_kwargs={'strategy': "ocr_only"})
        #   documents = loader.load()

        # Close the image handle as soon as the text has been extracted.
        with Image.open(tmpfile.name) as image:
          res_str = pytesseract.image_to_string(image)
        print("IMAGE PARSING RESULT:", res_str)

        texts = [res_str]
        metadatas: List[Dict[str, Any]] = [{
            'course_name': course_name,
            's3_path': s3_path,
            'readable_filename': kwargs.get('readable_filename',
                                            Path(s3_path).name[37:]),
            'pagenumber': '',
            'timestamp': '',
            'url': '',
            'base_url': '',
        }]

        # split_and_upload reports failures through its return value, so pass it on
        # instead of unconditionally claiming success.
        return self.split_and_upload(texts=texts, metadatas=metadatas)
    except Exception as e:
      # One f-string keeps `err` a str; a comma before the traceback would make it a tuple.
      err = f"❌❌ Error in (png/jpg ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_single_csv(self, s3_path: str, course_name: str, **kwargs) -> str:
    """
    Ingest a single .csv file from S3.

    Downloads the object to a temporary file, loads it with `CSVLoader`, and
    passes the text plus one metadata dict per loaded document to
    `split_and_upload`.

    Args:
        s3_path (str): Key of the .csv object in the `S3_BUCKET_NAME` bucket.
        course_name (str): Course name recorded in the metadata.
        **kwargs: Optional `readable_filename`. Defaults to the S3 file name with
            its first 37 characters dropped (presumably a uuid prefix).

    Returns:
        str: The result of `split_and_upload`, or an error string (exception
        message plus traceback) if anything raises.
    """
    try:
      with NamedTemporaryFile() as tmpfile:
        # download from S3 into tmpfile
        self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile)
        # The loader reopens the file by name, so push any bytes still sitting in
        # Python's write buffer out to the file first.
        tmpfile.flush()

        documents = CSVLoader(file_path=tmpfile.name).load()

        texts = [doc.page_content for doc in documents]
        metadatas: List[Dict[str, Any]] = [{
            'course_name': course_name,
            's3_path': s3_path,
            'readable_filename': kwargs.get('readable_filename',
                                            Path(s3_path).name[37:]),
            'pagenumber': '',
            'timestamp': '',
            'url': '',
            'base_url': '',
        } for _ in documents]

        # split_and_upload reports failures through its return value, so pass it on
        # instead of unconditionally claiming success.
        return self.split_and_upload(texts=texts, metadatas=metadatas)
    except Exception as e:
      # One f-string keeps `err` a str; a comma before the traceback would make it a tuple.
      err = f"❌❌ Error in (CSV ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
    """
    Ingest a single PDF from S3: extract the text of every page and upload a
    PNG thumbnail of the first page.

    The PDF is downloaded to a temporary file and opened with PyMuPDF (`fitz`).
    Page 0 is rendered at 2x zoom and uploaded next to the PDF with the suffix
    `-pg1-thumb.png`. Each page's text comes from `page.get_text()` and becomes
    one entry for `split_and_upload`, with a 1-based `pagenumber`.

    Args:
        s3_path (str): Key of the PDF object in the `S3_BUCKET_NAME` bucket.
        course_name (str): Course name recorded in the metadata.
        **kwargs: Optional `readable_filename`, `url` and `base_url`.
            `readable_filename` defaults to the S3 file name with its first 37
            characters dropped (presumably a uuid prefix).

    Returns:
        The result of `split_and_upload`, or an error string (exception message
        plus traceback) if anything raises.
    """
    print("IN PDF ingest: s3_path: ", s3_path, "and kwargs:", kwargs)

    try:
      with NamedTemporaryFile() as pdf_tmpfile:
        # download from S3 into pdf_tmpfile
        self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile)
        # fitz reopens the file by name, so push any bytes still sitting in
        # Python's write buffer out to the file first.
        pdf_tmpfile.flush()
        doc = fitz.open(pdf_tmpfile.name)  # type: ignore

        # improve quality of the thumbnail: zoom factor 2 in each dimension
        zoom_x = 2.0  # horizontal zoom
        zoom_y = 2.0  # vertical zoom
        mat = fitz.Matrix(zoom_x, zoom_y)

        default_filename = Path(s3_path).name[37:]
        pdf_pages: List[Dict] = []
        for i, page in enumerate(doc):  # type: ignore

          # UPLOAD FIRST PAGE IMAGE to S3
          if i == 0:
            with NamedTemporaryFile(suffix=".png") as first_page_png:
              pix = page.get_pixmap(matrix=mat)
              pix.save(first_page_png)  # store image as a PNG

              s3_upload_path = str(Path(s3_path)).rsplit('.pdf')[0] + "-pg1-thumb.png"
              first_page_png.seek(0)  # Seek the file pointer back to the beginning
              with open(first_page_png.name, 'rb') as f:
                print("Uploading image png to S3")
                self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path)

          # Extract text
          text = page.get_text().encode("utf8").decode("utf8", errors='ignore')  # get plain text (is in UTF-8)
          pdf_pages.append(dict(text=text, page_number=i, readable_filename=default_filename))

        metadatas: List[Dict[str, Any]] = [
            {
                'course_name': course_name,
                's3_path': s3_path,
                'pagenumber': pdf_page['page_number'] + 1,  # +1 for human indexing
                'timestamp': '',
                'readable_filename': kwargs.get('readable_filename', pdf_page['readable_filename']),
                'url': kwargs.get('url', ''),
                'base_url': kwargs.get('base_url', ''),
            } for pdf_page in pdf_pages
        ]
        pdf_texts = [pdf_page['text'] for pdf_page in pdf_pages]

        return self.split_and_upload(texts=pdf_texts, metadatas=metadatas)
    except Exception as e:
      # One f-string keeps `err` a str; a comma before the traceback would make it a tuple.
      err = f"❌❌ Error in (PDF ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str:
    """Ingest a single .txt or .md file from S3.

    The object body is read directly from the S3 response and decoded as UTF-8
    (no temporary file), then handed to `split_and_upload` as one document.

    Args:
        s3_path (str): A path to a .txt file in S3
        course_name (str): The name of the course
        **kwargs: Optional `readable_filename`. Defaults to the S3 file name with
            its first 37 characters dropped (presumably a uuid prefix).
    Returns:
        str: "Success" or an error message
    """
    print("In text ingest")
    try:
      # NOTE: slightly different method for .txt files, no need for download. It's part of the 'body'
      response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path)
      print("s3 Resonse:", response)
      file_text = response['Body'].read().decode('utf-8')
      print("Text from s3:", file_text)

      metadatas: List[Dict[str, Any]] = [{
          'course_name': course_name,
          's3_path': s3_path,
          'readable_filename': kwargs.get('readable_filename',
                                          Path(s3_path).name[37:]),
          'pagenumber': '',
          'timestamp': '',
          'url': '',
          'base_url': '',
      }]
      print("Prior to ingest", metadatas)

      return self.split_and_upload(texts=[file_text], metadatas=metadatas)
    except Exception as e:
      # One f-string keeps `err` a str; a comma before the traceback would make it a tuple.
      err = f"❌❌ Error in (TXT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _ingest_single_ppt(self, s3_path: str, course_name: str, **kwargs) -> str:
    """
    Ingest a single .ppt or .pptx file from S3.

    Downloads the object to a temporary file, loads it with
    `UnstructuredPowerPointLoader`, and passes the text plus one metadata dict
    per loaded document to `split_and_upload`.

    Args:
        s3_path (str): Key of the presentation in the `S3_BUCKET_NAME` bucket.
        course_name (str): Course name recorded in the metadata.
        **kwargs: Optional `readable_filename`. Defaults to the S3 file name with
            its first 37 characters dropped (presumably a uuid prefix).

    Returns:
        str: The result of `split_and_upload`, or an error string (exception
        message plus traceback) if anything raises.
    """
    try:
      with NamedTemporaryFile() as tmpfile:
        # download from S3 into tmpfile
        self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile)
        # The loader reopens the file by name, so push any bytes still sitting in
        # Python's write buffer out to the file first.
        tmpfile.flush()

        documents = UnstructuredPowerPointLoader(tmpfile.name).load()

        texts = [doc.page_content for doc in documents]
        metadatas: List[Dict[str, Any]] = [{
            'course_name': course_name,
            's3_path': s3_path,
            'readable_filename': kwargs.get('readable_filename',
                                            Path(s3_path).name[37:]),
            'pagenumber': '',
            'timestamp': '',
            'url': '',
            'base_url': '',
        } for _ in documents]

        # split_and_upload reports failures through its return value, so pass it on
        # instead of unconditionally claiming success.
        return self.split_and_upload(texts=texts, metadatas=metadatas)
    except Exception as e:
      # One f-string keeps `err` a str; a comma before the traceback would make it a tuple.
      err = f"❌❌ Error in (PPTX ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def list_files_recursively(self, bucket, prefix):
    """
    Collect the keys of every object under `prefix` in `bucket`.

    Calls `list_objects_v2` repeatedly, feeding each response's
    `NextContinuationToken` into the next request, until a response reports
    `IsTruncated` as false.

    Args:
        bucket: Bucket name passed as `Bucket`.
        prefix: Key prefix passed as `Prefix`.

    Returns:
        list: Object keys in the order the listing returned them.
    """
    collected_keys = []
    next_token = None

    while True:
      request = {'Bucket': bucket, 'Prefix': prefix}
      if next_token:
        request['ContinuationToken'] = next_token

      page = self.s3_client.list_objects_v2(**request)

      # 'Contents' is absent when a page holds no objects
      if 'Contents' in page:
        collected_keys.extend(entry['Key'] for entry in page['Contents'])

      if not page['IsTruncated']:
        return collected_keys
      next_token = page['NextContinuationToken']

  def ingest_coursera(self, coursera_course_name: str, course_name: str) -> str:
    """ Download all the files from a coursera course and ingest them.

    1. Download the coursera content.
    2. Upload to S3 (so users can view it)
    3. Run everything through the bulk_ingest method.

    The download directory `coursera-dl/<coursera_course_name>` is removed
    afterwards whether or not the ingest succeeded.

    Args:
        coursera_course_name (str): The name of the coursera course. Must be a
            plain name: no path separators, not `.`/`..`, and not starting
            with `-`.
        course_name (str): The name of the course in our system.

    Returns:
        str: "Success" or an error message.
    """
    # The name ends up both on the coursera-dl command line and in the path that
    # is deleted during cleanup, so refuse anything that could be read as an
    # option or that points outside the download directory.
    if (not coursera_course_name or coursera_course_name.startswith('-') or coursera_course_name in ('.', '..') or
        os.path.basename(coursera_course_name) != coursera_course_name):
      return f"Error: invalid coursera course name: `{coursera_course_name}`"

    # SECURITY NOTE(review): the `-ca` value and the account credentials below are
    # hard-coded in source. They should be rotated and loaded from the deployment's
    # secret configuration instead of living in the repository.
    certificate = "FVhVoDp5cb-ZaoRr5nNJLYbyjCLz8cGvaXzizqNlQEBsG5wSq7AHScZGAGfC1nI0ehXFvWy1NG8dyuIBF7DLMA.X3cXsDvHcOmSdo3Fyvg27Q.qyGfoo0GOHosTVoSMFy-gc24B-_BIxJtqblTzN5xQWT3hSntTR1DMPgPQKQmfZh_40UaV8oZKKiF15HtZBaLHWLbpEpAgTg3KiTiU1WSdUWueo92tnhz-lcLeLmCQE2y3XpijaN6G4mmgznLGVsVLXb-P3Cibzz0aVeT_lWIJNrCsXrTFh2HzFEhC4FxfTVqS6cRsKVskPpSu8D9EuCQUwJoOJHP_GvcME9-RISBhi46p-Z1IQZAC4qHPDhthIJG4bJqpq8-ZClRL3DFGqOfaiu5y415LJcH--PRRKTBnP7fNWPKhcEK2xoYQLr9RxBVL3pzVPEFyTYtGg6hFIdJcjKOU11AXAnQ-Kw-Gb_wXiHmu63veM6T8N2dEkdqygMre_xMDT5NVaP3xrPbA4eAQjl9yov4tyX4AQWMaCS5OCbGTpMTq2Y4L0Mbz93MHrblM2JL_cBYa59bq7DFK1IgzmOjFhNG266mQlC9juNcEhc"
    # Argument list + no shell: the course name is passed as exactly one argument
    # and is never interpreted by a shell.
    download_cmd = [
        "coursera-dl",
        "-u", "kastanvday@gmail.com",
        "-p", "hSBsLaF5YM469#",
        "--ignore-formats", "mp4",
        "--subtitle-language", "en",
        "--path", "./coursera-dl",
        "-ca", certificate,
        coursera_course_name,
    ]  # yapf: disable
    dl_results_path = os.path.join('coursera-dl', coursera_course_name)

    try:
      subprocess.run(download_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      s3_paths = upload_data_files_to_s3(course_name, dl_results_path)

      if s3_paths is None:
        return "Error: No files found in the coursera-dl directory"

      print("starting bulk ingest")
      start_time = time.monotonic()
      self.bulk_ingest(s3_paths, course_name)
      print("completed bulk ingest")
      print(f"⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds")

      return "Success"
    except Exception as e:
      err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err
    finally:
      # Cleanup the coursera downloads on every path; the directory may not exist
      # if the download itself failed.
      shutil.rmtree(dl_results_path, ignore_errors=True)

  def ingest_github(self, github_url: str, course_name: str) -> str:
    """
    Clones the given GitHub URL and uses Langchain to load data.
    1. Clone the repo (shallow, no submodules) into `media/cloned_repo`
    2. Use Langchain's `GitLoader` to load the files of the checked-out branch
    3. Pass each file to split_and_upload()

    The clone directory is removed afterwards whether or not loading succeeded.

    Args:
        github_url (str): The Github Repo URL to be ingested.
        course_name (str): The name of the course in our system.

    Returns:
        str: "Success" or an error message (exception message plus traceback).
    """
    repo_path = "media/cloned_repo"
    try:
      # A checkout left behind by an interrupted run would make the clone fail
      # because the target directory already exists.
      shutil.rmtree(repo_path, ignore_errors=True)
      repo = Repo.clone_from(github_url, to_path=repo_path, depth=1, clone_submodules=False)
      branch = str(repo.head.reference)

      data = GitLoader(repo_path=repo_path, branch=branch).load()

      # Build file links from the web URL of the repo: no trailing slash, no ".git".
      web_url = github_url.rstrip('/')
      if web_url.endswith('.git'):
        web_url = web_url[:-len('.git')]

      # create metadata for each file in data
      for doc in data:
        metadata: Dict[str, Any] = {
            'course_name': course_name,
            's3_path': '',
            'readable_filename': doc.metadata['file_name'],
            # link to the branch that was actually cloned (not every repo uses "main")
            'url': f"{web_url}/blob/{branch}/{doc.metadata['file_path']}",
            'pagenumber': '',
            'timestamp': '',
        }
        self.split_and_upload(texts=[doc.page_content], metadatas=[metadata])
      return "Success"
    except Exception as e:
      err = f"❌❌ Error in (GITHUB ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err
    finally:
      # remove the clone on success and on failure alike
      shutil.rmtree(repo_path, ignore_errors=True)

  def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
    """ This is usually the last step of document ingest. Chunk & upload to Qdrant and Supabase.
    Takes in Text and Metadata (from Langchain doc loaders) and splits / uploads them.

    Steps performed here:
      1. Split the texts into ~1000-token chunks (150 overlap) with a tiktoken-based splitter.
      2. Ask `check_for_duplicates`; if it reports a duplicate, return "Success" without uploading.
      3. Embed every chunk with `text-embedding-ada-002`.
      4. Upsert the chunks into the Qdrant collection and insert one row into the Supabase table.
      5. Log the inserted row to the Nomic document map.
    PostHog events are captured on invocation and on success.

    good examples here: https://langchain.readthedocs.io/en/latest/modules/utils/combine_docs_examples/textsplitter.html

    Args:
        texts (List[str]): One string per source document/page.
        metadatas (List[Dict[str, Any]]): One metadata dict per entry in `texts`.
            The first dict's `course_name`, `s3_path`, `readable_filename`, `url`
            and `base_url` are used for analytics.

    Returns:
        str: "Success", or an error string if chunking/embedding/upload raises.

    Raises:
        ValueError: If `texts` and `metadatas` differ in length or are empty.
    """
    # Validate before anything reads metadatas[0], so bad input fails with a clear message.
    if len(texts) != len(metadatas):
      raise ValueError(
          f'must have equal number of text strings and metadata dicts. len(texts) is {len(texts)}. len(metadatas) is {len(metadatas)}'
      )
    if not metadatas:
      raise ValueError('split_and_upload() needs at least one text and one metadata dict, but got none')

    def analytics_properties(**extra) -> Dict[str, Any]:
      """Build a fresh PostHog properties dict from the first metadata entry."""
      first = metadatas[0]
      return {
          'course_name': first.get('course_name', None),
          's3_path': first.get('s3_path', None),
          'readable_filename': first.get('readable_filename', None),
          'url': first.get('url', None),
          'base_url': first.get('base_url', None),
          **extra,
      }

    self.posthog.capture('distinct_id_of_the_user',
                         event='split_and_upload_invoked',
                         properties=analytics_properties())

    print("In split and upload")
    print(f"metadatas: {metadatas}")
    print(f"Texts: {texts}")

    try:
      text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
          chunk_size=1000,
          chunk_overlap=150,
          separators=[
              "\n\n", "\n", ". ", " ", ""
          ]  # try to split on paragraphs... fallback to sentences, then chars, ensure we always fit in context window
      )
      contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas)
      input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts]

      # check for duplicates
      is_duplicate = self.check_for_duplicates(input_texts, metadatas)
      if is_duplicate:
        self.posthog.capture('distinct_id_of_the_user',
                             event='split_and_upload_succeeded',
                             properties=analytics_properties(is_duplicate=True))
        return "Success"

      # adding chunk index to metadata for parent doc retrieval
      for i, context in enumerate(contexts):
        context.metadata['chunk_index'] = i

      oai = OpenAIAPIProcessor(
          input_prompts_list=input_texts,
          request_url='https://api.openai.com/v1/embeddings',
          api_key=os.getenv('VLADS_OPENAI_KEY'),
          # request_url='https://uiuc-chat-canada-east.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-05-15',
          # api_key=os.getenv('AZURE_OPENAI_KEY'),
          max_requests_per_minute=5_000,
          max_tokens_per_minute=300_000,
          max_attempts=20,
          logging_level=logging.INFO,
          token_encoding_name='cl100k_base')  # nosec -- reasonable bandit error suppression
      asyncio.run(oai.process_api_requests_from_file())
      # parse results into dict of shape page_content -> embedding
      embeddings_dict: dict[str, List[float]] = {
          item[0]['input']: item[1]['data'][0]['embedding'] for item in oai.results
      }

      ### BULK upload to Qdrant ###
      vectors: list[PointStruct] = []
      for context in contexts:
        # Each metadata key is stored at the top level of the payload (payload.course_name,
        # not payload.metadata.course_name), which is what the payload filters rely on.
        upload_metadata = {**context.metadata, "page_content": context.page_content}
        vectors.append(
            PointStruct(id=str(uuid.uuid4()), vector=embeddings_dict[context.page_content], payload=upload_metadata))

      self.qdrant_client.upsert(
          collection_name=os.environ['QDRANT_COLLECTION_NAME'],  # type: ignore
          points=vectors  # type: ignore
      )

      ### Supabase SQL ###
      contexts_for_supa = [{
          "text": context.page_content,
          "pagenumber": context.metadata.get('pagenumber'),
          "timestamp": context.metadata.get('timestamp'),
          "chunk_index": context.metadata.get('chunk_index'),
          "embedding": embeddings_dict[context.page_content]
      } for context in contexts]

      # One Supabase row per upload; document-level fields come from the first chunk.
      document = {
          "course_name": contexts[0].metadata.get('course_name'),
          "s3_path": contexts[0].metadata.get('s3_path'),
          "readable_filename": contexts[0].metadata.get('readable_filename'),
          "url": contexts[0].metadata.get('url'),
          "base_url": contexts[0].metadata.get('base_url'),
          "contexts": contexts_for_supa,
      }

      response = self.supabase_client.table(
          os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute()  # type: ignore

      # add to Nomic document map
      if len(response.data) > 0:
        log_to_document_map(response.data[0])

      self.posthog.capture('distinct_id_of_the_user',
                           event='split_and_upload_succeeded',
                           properties=analytics_properties())
      print("successful END OF split_and_upload")
      return "Success"
    except Exception as e:
      err: str = f"ERROR IN split_and_upload(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def delete_entire_course(self, course_name: str):
    """Delete entire course.

    Delete materials from S3, the Qdrant vector DB, and Supabase SQL. The three
    deletions are independent best-effort steps: a failure in one is printed and
    reported to Sentry, and the following steps still run.

    Args:
        course_name (str): Name of the course. S3 objects are looked up under the
            `courses/<course_name>/` prefix; Qdrant points and Supabase rows are
            matched on their `course_name` field.

    Returns:
        "Success" when the Supabase delete (the last step) completes, otherwise
        None. The outcome of the S3 and Qdrant steps is not reflected in the
        return value.
    """
    print(f"Deleting entire course: {course_name}")
    try:
      # Delete files from S3
      print("Deleting from S3")
      bucket_name = os.getenv('S3_BUCKET_NAME')
      # Use the paginated listing: a single list call returns only one page of
      # keys, and a course without any objects has no 'Contents' entry at all.
      for key in self.list_files_recursively(bucket_name, f'courses/{course_name}/'):
        self.s3_client.delete_object(Bucket=bucket_name, Key=key)
    except Exception as e:
      err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)

    try:
      # Delete from Qdrant
      # docs for filtering: https://qdrant.tech/documentation/concepts/filtering/
      # Every point whose payload key `course_name` equals this course is removed.
      print("deleting from qdrant")
      self.qdrant_client.delete(
          collection_name=os.environ['QDRANT_COLLECTION_NAME'],
          points_selector=models.Filter(must=[
              models.FieldCondition(
                  key="course_name",
                  match=models.MatchValue(value=course_name),
              ),
          ]),
      )
    except Exception as e:
      err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)

    try:
      # Delete from Supabase
      print("deleting from supabase")
      response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
          'course_name', course_name).execute()
      print("supabase response: ", response)
      return "Success"
    except Exception as e:
      err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
    # todo: delete from Vercel KV to fully make the course not exist. Last db to delete from (as of now, Aug 15)

  def delete_data(self, course_name: str, s3_path: str, source_url: str):
    """Delete a single document from S3, Qdrant, Nomic, and Supabase.

    The document is identified by `s3_path` when it is non-empty, otherwise by
    `source_url`. Every backend is attempted independently: a failure in one of
    them is printed and reported to Sentry, and the remaining deletions still run.

    Args:
        course_name: Course the document belongs to.
        s3_path: S3 object key of the document. Takes precedence over `source_url`.
        source_url: URL the document was ingested from. Only used when `s3_path` is empty.

    Returns:
        "Success" once every deletion has been attempted (also when both
        identifiers are empty, in which case nothing is deleted), or an error
        string with the traceback if an exception escapes the per-backend handlers.
    """
    print(f"Deleting {s3_path} from S3, Qdrant, and Supabase for course {course_name}")
    try:
      if s3_path:
        # Only documents addressed by S3 path have an object to remove from the bucket.
        try:
          self.s3_client.delete_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path)
        except Exception as e:
          print("Error in deleting file from s3:", e)
          sentry_sdk.capture_exception(e)
        self._delete_document_records(course_name, 's3_path', s3_path)
      elif source_url:
        self._delete_document_records(course_name, 'url', source_url)

      return "Success"
    except Exception as e:
      err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def _delete_document_records(self, course_name: str, identifier_key: str, identifier_value: str) -> None:
    """Remove one document, matched on `identifier_key`, from Qdrant, Nomic, and Supabase.

    Args:
        course_name: Course the document belongs to.
        identifier_key: Name of the identifying field, 's3_path' or 'url'. It is used both
            as the Qdrant payload key and as the Supabase column name.
        identifier_value: Value of that field for the document being deleted.
    """
    # 1) Qdrant. Docs for filtering on (nested) payload keys:
    #    https://qdrant.tech/documentation/concepts/filtering/#nested-key
    # NOTE(review): this filter matches on the identifier only, while the Supabase queries
    # below are also scoped by course_name -- confirm that a URL shared by two courses
    # should really lose its vectors for both.
    try:
      self.qdrant_client.delete(
          collection_name=os.environ['QDRANT_COLLECTION_NAME'],
          points_selector=models.Filter(must=[
              models.FieldCondition(
                  key=identifier_key,
                  match=models.MatchValue(value=identifier_value),
              ),
          ]),
      )
    except Exception as e:
      # Timed out is fine. Still deletes.
      # https://github.com/qdrant/qdrant/issues/3654#issuecomment-1955074525
      if "timed out" not in str(e):
        print("Error in deleting file from Qdrant:", e)
        sentry_sdk.capture_exception(e)

    # 2) Nomic. The Supabase row must be read *before* it is deleted in step 3, because the
    #    Nomic ids are derived from it: "<row id>_<1-based index of the context>".
    try:
      response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select(
          f"id, {identifier_key}, contexts").eq(identifier_key, identifier_value).eq('course_name',
                                                                                      course_name).execute()
      if response.data:
        record = response.data[0]  # single record fetched
        nomic_ids_to_delete = [f"{record['id']}_{i}" for i in range(1, len(record['contexts']) + 1)]
        delete_from_document_map(course_name, nomic_ids_to_delete)
      else:
        # Nothing to remove from the map; this is not an error worth a Sentry event.
        print(f"No Supabase record found for {identifier_key} `{identifier_value}` in course `{course_name}`, "
              "skipping Nomic delete")
    except Exception as e:
      print("Error in deleting file from Nomic:", e)
      sentry_sdk.capture_exception(e)

    # 3) Supabase.
    try:
      self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
          identifier_key, identifier_value).eq('course_name', course_name).execute()
    except Exception as e:
      print("Error in deleting file from supabase:", e)
      sentry_sdk.capture_exception(e)

  def getAll(
      self,
      course_name: str,
  ):
    """List the distinct course materials stored in Supabase for one course.

    Args:
        course_name (as uploaded on supabase)
    Returns:
        list of dictionaries (one per distinct combination of s3_path, readable_filename,
        course_name, url and base_url), in the order the rows were returned.
    """
    materials_table = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']
    query = self.supabase_client.table(materials_table).select('course_name, s3_path, readable_filename, url, base_url')
    rows = query.eq('course_name', course_name).execute().data

    seen_keys = set()
    distinct_rows = []
    for row in rows:
      # Rows are dicts (unhashable), so de-duplicate on a tuple of their fields.
      row_key = (row['s3_path'], row['readable_filename'], row['course_name'], row['url'], row['base_url'])
      if row_key in seen_keys:
        continue
      seen_keys.add(row_key)
      distinct_rows.append(row)

    return distinct_rows

  def vector_search(self, search_query, course_name, top_n: int = 80):
    """Embed `search_query` and return the closest chunks of `course_name` from Qdrant.

    Args:
        search_query: Text of the user's query.
        course_name: Only points whose `course_name` payload field equals this value are searched.
        top_n: Maximum number of points requested from Qdrant.

    Returns:
        list[Document]: One Document per hit, in the order Qdrant returned them. The payload
        (minus `page_content`) becomes the metadata. Hits whose payload cannot be processed
        are printed, reported to Sentry, and skipped.
    """
    # EMBED
    openai_start_time = time.monotonic()
    o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)
    user_query_embedding = o.embed_query(search_query)
    openai_embedding_latency = time.monotonic() - openai_start_time

    # SEARCH
    myfilter = models.Filter(must=[
        models.FieldCondition(key='course_name', match=models.MatchValue(value=course_name)),
    ])
    self.posthog.capture('distinct_id_of_the_user',
                         event='vector_search_invoked',
                         properties={
                             'user_query': search_query,
                             'course_name': course_name,
                         })
    qdrant_start_time = time.monotonic()
    search_results = self.qdrant_client.search(
        collection_name=os.environ['QDRANT_COLLECTION_NAME'],
        query_filter=myfilter,
        with_vectors=False,
        query_vector=user_query_embedding,
        limit=top_n,  # Return n closest points

        # In a system with high disk latency, the re-scoring step may become a bottleneck: https://qdrant.tech/documentation/guides/quantization/
        search_params=models.SearchParams(quantization=models.QuantizationSearchParams(rescore=False)))
    # Stop the clock right after the call so the metric covers the Qdrant round trip only,
    # not the payload post-processing below.
    qdrant_latency = time.monotonic() - qdrant_start_time

    found_docs: list[Document] = []
    for d in search_results:
      try:
        metadata = d.payload
        # The text travels inside the payload; split it off so it is not duplicated in the metadata.
        page_content = metadata.pop('page_content')  # type: ignore
        if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys():  # type: ignore
          # aiding in the database migration...
          metadata["pagenumber"] = metadata["pagenumber_or_timestamp"]  # type: ignore

        found_docs.append(Document(page_content=page_content, metadata=metadata))  # type: ignore
      except Exception as e:
        print(f"Error in vector_search(), for course: `{course_name}`. Error: {e}")
        sentry_sdk.capture_exception(e)

    self.posthog.capture('distinct_id_of_the_user',
                         event='vector_search_succeded',
                         properties={
                             'user_query': search_query,
                             'course_name': course_name,
                             'qdrant_latency_sec': qdrant_latency,
                             'openai_embedding_latency_sec': openai_embedding_latency
                         })
    return found_docs

  def getTopContexts(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]:
    """Retrieve the most relevant contexts for a query, capped by a token budget.

    Runs `vector_search` and then keeps documents, in ranked order, until the
    next one would push the running token count past `token_limit`.

    Args:
      search_query (str): the user's query.
      course_name (str): course whose documents are searched.
      token_limit (int): budget shared by the prompt scaffolding, the query and the contexts.

    Returns
      List[Dict]: the kept documents as formatted by `format_for_json` ([] when none fit).
      or
      String: An error message with traceback.
    """
    try:
      start_time_overall = time.monotonic()

      retrieved_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name)

      pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
      # The budget starts with the fixed scaffolding plus the query; every accepted context is added on top.
      query_suffix = '\n\nNow please respond to my query: ' + search_query
      tokens_used, _ = count_tokens_and_cost(pre_prompt + query_suffix)  # type: ignore

      selected_docs = []
      for doc in retrieved_docs:
        filename = doc.metadata['readable_filename']
        page_part = ', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''
        chunk_text = f"Document: {filename}{page_part}\n{str(doc.page_content)}\n"
        chunk_tokens, chunk_cost = count_tokens_and_cost(chunk_text)  # type: ignore

        print(
            f"tokens used/limit: {tokens_used}/{token_limit}, tokens in chunk: {chunk_tokens}, total prompt cost (of these contexts): {chunk_cost}. 📄 File: {filename}"
        )
        if tokens_used + chunk_tokens > token_limit:
          # filled our token size, time to return
          break
        tokens_used += chunk_tokens
        selected_docs.append(doc)

      print(f"Total tokens used: {tokens_used}. Docs used: {len(selected_docs)} of {len(retrieved_docs)} docs retrieved")
      print(f"Course: {course_name} ||| search_query: {search_query}")
      print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds")
      if not selected_docs:
        return []

      event_properties = {
          'user_query': search_query,
          'course_name': course_name,
          'token_limit': token_limit,
          'total_tokens_used': tokens_used,
          'total_contexts_used': len(selected_docs),
          'total_unique_docs_retrieved': len(retrieved_docs),
          'getTopContext_total_latency_sec': time.monotonic() - start_time_overall,
      }
      self.posthog.capture('distinct_id_of_the_user', event='success_get_top_contexts_OG', properties=event_properties)

      return self.format_for_json(selected_docs)
    except Exception as e:
      # return full traceback to front end
      err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def batch_vector_search(self, search_queries: List[str], course_name: str, top_n: int = 50):
    """
    Perform a similarity search for all the generated queries at once.

    Each query is embedded separately, then all searches are sent to Qdrant in one batch request.

    Args:
      search_queries (List[str]): queries to search for.
      course_name (str): only points whose `course_name` payload field matches are searched.
      top_n (int): maximum number of hits requested per query.
    Returns:
      list[list[Document]]: one list of Documents per search result returned by the batch call.
      Hits whose payload cannot be processed are logged, reported to Sentry, and skipped.
    """
    start_time = time.monotonic()

    from qdrant_client.http import models as rest
    o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)
    # Prepare the filter for the course name
    myfilter = rest.Filter(must=[
        rest.FieldCondition(key='course_name', match=rest.MatchValue(value=course_name)),
    ])

    # Prepare the search requests (one embedding call per query)
    search_requests = [
        rest.SearchRequest(vector=o.embed_query(query),
                           filter=myfilter,
                           limit=top_n,
                           with_payload=True,
                           params=models.SearchParams(quantization=models.QuantizationSearchParams(rescore=False)))
        for query in search_queries
    ]

    # Perform the batch search
    search_results = self.qdrant_client.search_batch(
        collection_name=os.environ['QDRANT_COLLECTION_NAME'],
        requests=search_requests,
    )
    # process search results
    found_docs: list[list[Document]] = []
    for result in search_results:
      docs = []
      for doc in result:
        try:
          metadata = doc.payload
          # The text travels inside the payload; split it off so it is not duplicated in the metadata.
          page_content = metadata.pop('page_content')

          if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys():
            metadata["pagenumber"] = metadata["pagenumber_or_timestamp"]

          docs.append(Document(page_content=page_content, metadata=metadata))
        except Exception as e:
          # print_exc() writes the traceback itself and returns None, so it must not be
          # wrapped in print() (that emitted a stray "None" line).
          traceback.print_exc()
          sentry_sdk.capture_exception(e)
      found_docs.append(docs)

    print(f"⏰ Qdrant Batch Search runtime: {(time.monotonic() - start_time):.2f} seconds")
    return found_docs

  def reciprocal_rank_fusion(self, results: list[list], k=60):
    """
    Merge several ranked result lists into a single ranking.

    Every appearance of a document at 0-based position `rank` in one of the lists adds
    1 / (rank + k) to that document's score, so documents that rank high in many lists
    end up first. Documents are identified by their `dumps()` serialization.

    Args:
      results: one list of documents per query, each assumed to be sorted by relevance.
      k: smoothing constant added to the rank.
    Returns:
      list of (document, fused score) tuples, highest score first.
    """
    total_docs = sum(len(ranked_docs) for ranked_docs in results)

    fused_scores = {}
    for ranked_docs in results:
      for position, doc in enumerate(ranked_docs):
        serialized = dumps(doc)
        fused_scores[serialized] = fused_scores.get(serialized, 0) + 1 / (position + k)

    print(f"Total number of documents in rank fusion: {total_docs}")
    print(f"Total number of unique documents in rank fusion: {len(fused_scores)}")

    by_score_desc = sorted(fused_scores.items(), key=lambda entry: entry[1], reverse=True)
    return [(loads(serialized), score) for serialized, score in by_score_desc]

  def getTopContextsWithMQR(self,
                            search_query: str,
                            course_name: str,
                            token_limit: int = 4_000) -> Union[List[Dict], str]:
    """
    DISABLED: this method currently ignores its arguments and always returns the string 'fail'.
    The previous implementation is kept as commented-out code right below this method.

    Design of the (disabled) pipeline, for reference:
    New info-retrieval pipeline that uses multi-query retrieval + filtering + reciprocal rank fusion + context padding.
    1. Generate multiple queries based on the input search query.
    2. Retrieve relevant docs for each query.
    3. Filter the relevant docs based on the user query and pass them to the rank fusion step.
    4. [CANCELED BEC POINTLESS] Rank the docs based on the relevance score.
    5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document.

    Returns:
      The literal string 'fail'.
    """
    return 'fail'

  #   try:
  #     top_n_per_query = 40  # HARD CODE TO ENSURE WE HIT THE MAX TOKENS
  #     start_time_overall = time.monotonic()
  #     mq_start_time = time.monotonic()

  #     # 1. GENERATE MULTIPLE QUERIES
  #     generate_queries = (
  #         MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) |
  #         (lambda x: list(filter(None, x)))  # drop empty strings, keep the non-empty ones
  #     )

  #     generated_queries = generate_queries.invoke({"original_query": search_query})
  #     print("generated_queries", generated_queries)

  #     # 2. VECTOR SEARCH FOR EACH QUERY
  #     batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries,
  #                                                                              course_name=course_name,
  #                                                                              top_n=top_n_per_query)

  #     # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end.
  #     found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested)
  #     found_docs = [doc for doc, score in found_docs]
  #     print(f"Num docs after re-ranking: {len(found_docs)}")
  #     if len(found_docs) == 0:
  #       return []
  #     print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds")

  #     # 4. FILTER DOCS
  #     filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180)
  #     if len(filtered_docs) == 0:
  #       return []

  #     # 5. TOP DOC CONTEXT PADDING // parent document retriever
  #     final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name)
  #     print(f"Number of final docs after context padding: {len(final_docs)}")

  #     pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
  #     token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' +
  #                                              search_query)  # type: ignore

  #     valid_docs = []
  #     num_tokens = 0
  #     for doc in final_docs:
  #       doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n"
  #       num_tokens, prompt_cost = count_tokens_and_cost(doc_string)  # type: ignore

  #       print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}")
  #       if token_counter + num_tokens <= token_limit:
  #         token_counter += num_tokens
  #         valid_docs.append(doc)
  #       else:
  #         # filled our token size, time to return
  #         break

  #     print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.")
  #     print(f"Course: {course_name} ||| search_query: {search_query}")
  #     print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds")

  #     if len(valid_docs) == 0:
  #       return []

  #     self.posthog.capture('distinct_id_of_the_user',
  #                          event='filter_top_contexts_succeeded',
  #                          properties={
  #                              'user_query': search_query,
  #                              'course_name': course_name,
  #                              'token_limit': token_limit,
  #                              'total_tokens_used': token_counter,
  #                              'total_contexts_used': len(valid_docs),
  #                              'total_unique_docs_retrieved': len(found_docs),
  #                          })

  #     return self.format_for_json_mqr(valid_docs)
  #   except Exception as e:
  #     # return full traceback to front end
  #     err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}"  # type: ignore
  #     print(err)
  #     sentry_sdk.capture_exception(e)
  #     return err

  def format_for_json_mqr(self, found_docs) -> List[Dict]:
    """
    Same as format_for_json, but for the new MQR pipeline, where every doc is a plain dict.

    Docs without a 'pagenumber' key get one copied from 'pagenumber_or_timestamp'; this
    mutates the input dicts in place.

    Args:
      found_docs: iterable of dicts with keys 'text', 'readable_filename', 'course_name',
        's3_path', 'pagenumber' (or 'pagenumber_or_timestamp') and optionally 'url' / 'base_url'.
    Returns:
      List[Dict]: one context dict per doc. 'url' and 'base_url' are None when absent.
    Raises:
      KeyError: if a doc lacks one of the required keys.
    """
    for found_doc in found_docs:
      if "pagenumber" not in found_doc:
        print("found no pagenumber")
        found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp']

    return [
        {
            'text': doc['text'],
            'readable_filename': doc['readable_filename'],
            # NOTE(review): the trailing space in this key looks accidental, but consumers of
            # this JSON may rely on it -- confirm before renaming.
            'course_name ': doc['course_name'],
            's3_path': doc['s3_path'],
            'pagenumber': doc['pagenumber'],
            # Optional fields: use .get() so docs without them do not raise KeyError.
            'url': doc.get('url'),
            'base_url': doc.get('base_url'),
        } for doc in found_docs
    ]

  def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n: int, top_k_to_search: int) -> str:
    """
    Get a stuffed prompt for a given user question and course name.
    Args:
      user_question (str)
      course_name (str) : used for metadata filtering
    Returns : str
      a very long "stuffed prompt" with question + summaries of top_n most relevant documents.
    """
    # MMR with metadata filtering based on course_name
    vec_start_time = time.monotonic()
    found_docs = self.vectorstore.max_marginal_relevance_search(user_question, k=top_n, fetch_k=top_k_to_search)
    print(
        f"⏰ MMR Search runtime (top_n_to_keep: {top_n}, top_k_to_search: {top_k_to_search}): {(time.monotonic() - vec_start_time):.2f} seconds"
    )

    requests = []
    for doc in found_docs:
      print("doc", doc)
      dictionary = {
          "model": "gpt-3.5-turbo",
          "messages": [{
              "role":
                  "system",
              "content":
                  "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary."
          }, {
              "role":
                  "user",
              "content":
                  f"Provide a comprehensive summary of the given text, based on this question:\n{doc.page_content}\nQuestion: {user_question}\nThe summary should cover all the key points that are relevant to the question, while also condensing the information into a concise format. The length of the summary should be as short as possible, without losing relevant information.\nMake use of direct quotes from the text.\nFeel free to include references, sentence fragments, keywords or anything that could help someone learn about it, only as it relates to the given question.\nIf the text does not provide information to answer the question, please write 'None' and nothing else.",
          }],
          "n": 1,
          "max_tokens": 600,
          "metadata": doc.metadata
      }
      requests.append(dictionary)

    oai = OpenAIAPIProcessor(
        input_prompts_list=requests,
        request_url='https://api.openai.com/v1/chat/completions',
        api_key=os.getenv("OPENAI_API_KEY"),
        max_requests_per_minute=1500,
        max_tokens_per_minute=90000,
        token_encoding_name='cl100k_base',  # nosec -- reasonable bandit error suppression
        max_attempts=5,
        logging_level=20)

    chain_start_time = time.monotonic()
    asyncio.run(oai.process_api_requests_from_file())
    results: list[str] = oai.results
    print(f"⏰ EXTREME context stuffing runtime: {(time.monotonic() - chain_start_time):.2f} seconds")

    print(f"Cleaned results: {oai.cleaned_results}")

    all_texts = ""
    separator = '---'  # between each context
    token_counter = 0  #keeps track of tokens in each summarization
    max_tokens = 7_500  #limit, will keep adding text to string until 8000 tokens reached.
    for i, text in enumerate(oai.cleaned_results):
      if text.lower().startswith('none') or text.lower().endswith('none.') or text.lower().endswith('none'):
        # no useful text, it replied with a summary of "None"
        continue
      if text is not None:
        if "pagenumber" not in results[i][-1].keys():  # type: ignore
          results[i][-1]['pagenumber'] = results[i][-1].get('pagenumber_or_timestamp')  # type: ignore
        num_tokens, prompt_cost = count_tokens_and_cost(text)  # type: ignore
        if token_counter + num_tokens > max_tokens:
          print(f"Total tokens yet in loop {i} is {num_tokens}")
          break  # Stop building the string if it exceeds the maximum number of tokens
        token_counter += num_tokens
        filename = str(results[i][-1].get('readable_filename', ''))  # type: ignore
        pagenumber_or_timestamp = str(results[i][-1].get('pagenumber', ''))  # type: ignore
        pagenumber = f", page: {pagenumber_or_timestamp}" if pagenumber_or_timestamp else ''
        doc = f"Document : filename: {filename}" + pagenumber
        summary = f"\nSummary: {text}"
        all_texts += doc + summary + '\n' + separator + '\n'

    stuffed_prompt = """Please answer the following question.
Use the context below, called 'your documents', only if it's helpful and don't use parts that are very irrelevant.
It's good to quote 'your documents' directly using informal citations, like "in document X it says Y". Try to avoid giving false or misleading information. Feel free to say you don't know.
Try to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable.
That said, be practical and really do your best, and don't let caution get too much in the way of being useful.
To help answer the question, here's a few passages of high quality documents:\n{all_texts}
Now please respond to my question: {user_question}"""

    # "Please answer the following question. It's good to quote 'your documents' directly, something like 'from ABS source it says XYZ' Feel free to say you don't know. \nHere's a few passages of the high quality 'your documents':\n"

    return stuffed_prompt

  def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: int = 7_000) -> str:
    """
    Build one prompt containing the query plus as many retrieved contexts as fit in `token_limit`.

    Args:
      search_query (str): the user's query.
      course_name (str): only points whose `course_name` payload field matches are searched.
      token_limit (int): budget shared by the prompt scaffolding, the query and the contexts.

    Returns
      String: A fully formatted prompt string. If the search returns nothing this is just
      `search_query`; if an exception occurs it is an error message with the traceback.
    """
    try:
      top_n = 90
      start_time_overall = time.monotonic()
      o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)
      # embed_query is the single-string API (and what vector_search uses). embed_documents is
      # the list-of-texts API, so handing it the bare query string did not embed the query as a whole.
      user_query_embedding = o.embed_query(search_query)
      myfilter = models.Filter(must=[
          models.FieldCondition(key='course_name', match=models.MatchValue(value=course_name)),
      ])

      found_docs = self.qdrant_client.search(
          collection_name=os.environ['QDRANT_COLLECTION_NAME'],
          query_filter=myfilter,
          with_vectors=False,
          query_vector=user_query_embedding,
          limit=top_n  # Return the top_n closest points
      )
      print("Search results: ", found_docs)
      if len(found_docs) == 0:
        return search_query

      pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"

      # count tokens at start and end, then also count each context.
      token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' +
                                               search_query)  # type: ignore
      valid_docs = []
      for d in found_docs:
        payload = d.payload
        if payload is None:
          continue
        if "pagenumber" not in payload:
          # Older points only carry 'pagenumber_or_timestamp'. Use .get() so that a point with
          # neither key just omits the page instead of aborting the whole prompt.
          payload["pagenumber"] = payload.get("pagenumber_or_timestamp")

        doc_string = f"---\nDocument: {payload['readable_filename']}{', page: ' + str(payload['pagenumber']) if payload['pagenumber'] else ''}\n{payload.get('page_content')}\n"
        num_tokens, prompt_cost = count_tokens_and_cost(doc_string)  # type: ignore

        print(
            f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, prompt cost of chunk: {prompt_cost}. 📄 File: {payload.get('readable_filename', '')}"
        )
        # A chunk that does not fit is skipped rather than ending the loop, so smaller
        # chunks further down the ranking can still use the remaining budget.
        if token_counter + num_tokens <= token_limit:
          token_counter += num_tokens
          valid_docs.append(Document(page_content=payload.get('page_content', '<Missing page content>'),
                                     metadata=payload))

      # Convert the valid_docs to full prompt
      separator = '---\n'  # between each context
      context_text = separator.join(
          f"Document: {d.metadata['readable_filename']}{', page: ' + str(d.metadata['pagenumber']) if d.metadata['pagenumber'] else ''}\n{d.page_content}\n"
          for d in valid_docs)

      # Create the stuffedPrompt
      stuffedPrompt = (pre_prompt + context_text + '\n\nNow please respond to my query: ' + search_query)

      TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4')  # type: ignore
      print(f"Total tokens: {TOTAL_num_tokens}, prompt_cost: {prompt_cost}")
      print("total docs: ", len(found_docs))
      print("num docs used: ", len(valid_docs))

      print(f"⏰ ^^ Runtime of get_stuffed_prompt: {(time.monotonic() - start_time_overall):.2f} seconds")
      return stuffedPrompt
    except Exception as e:
      # return full traceback to front end
      err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
      print(err)
      sentry_sdk.capture_exception(e)
      return err

  def format_for_json(self, found_docs: List[Document]) -> List[Dict]:
    """Flatten retrieved documents into JSON-serializable context dicts.

    Documents whose metadata has no 'pagenumber' key get one copied, in place, from
    'pagenumber_or_timestamp' before the output is built.

    Args:
        found_docs (List[Document]): Objects exposing `.page_content` and a `.metadata` dict.

    Raises:
        KeyError: If 'readable_filename', 'course_name' or 's3_path' is missing from a
          document's metadata, or if both 'pagenumber' and 'pagenumber_or_timestamp' are.

    Returns:
        List[Dict]: One dict per document with the keys 'text', 'readable_filename',
          'course_name ' (sic, with a trailing space), 's3_path', 'pagenumber', 'url'
          and 'base_url'. The last two are None when absent from the metadata.
    """
    # First pass: backfill 'pagenumber' on every document before any output is built.
    for candidate in found_docs:
      meta = candidate.metadata
      if "pagenumber" in meta:
        continue
      print("found no pagenumber")
      meta['pagenumber'] = meta['pagenumber_or_timestamp']

    # Second pass: project each document onto the flat output shape.
    formatted: List[Dict] = []
    for candidate in found_docs:
      meta = candidate.metadata
      formatted.append({
          'text': candidate.page_content,
          'readable_filename': meta['readable_filename'],
          'course_name ': meta['course_name'],
          's3_path': meta['s3_path'],
          'pagenumber': meta['pagenumber'],
          # Optional fields: .get() yields None instead of raising.
          'url': meta.get('url'),
          'base_url': meta.get('base_url'),
      })
    return formatted

  def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]]) -> bool:
    """
    Decide whether an incoming document has already been ingested for its course.

    Existing rows are fetched from the Supabase materials table, newest id first:
    by filename (LIKE on `s3_path`) when an S3 path is given, otherwise by exact `url`.
    The concatenated text of the newest row is compared with the concatenated incoming text.

    Args:
      texts: Incoming chunks; each dict's 'input' value is the chunk text.
      metadatas: Per-chunk metadata. Only the first entry is read; it must contain
        'course_name', 's3_path' and 'url'.

    Returns:
      True if the stored text is identical to the incoming text (a duplicate).
      False if no matching row exists, or if the content differs. In the latter case
      every matching row is first passed to `self.delete_data`.
    """
    doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE', '')
    course_name = metadatas[0]['course_name']
    incoming_s3_path = metadatas[0]['s3_path']
    url = metadatas[0]['url']

    # Not all s3_paths carry a uuid. Strip it only when a v4 uuid is the filename's
    # *prefix* (36 uuid chars + 1 separator); a uuid elsewhere in the name must not
    # cause the first 37 characters to be chopped off.
    incoming_filename = incoming_s3_path.split('/')[-1]
    uuid_prefix = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)
    if uuid_prefix.match(incoming_filename):
      # Fall back to the full name if nothing is left, so the LIKE below never becomes '%%'.
      original_filename = incoming_filename[37:] or incoming_filename
    else:
      original_filename = incoming_filename

    if incoming_s3_path:
      filename = incoming_s3_path
      supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq(
          'course_name', course_name).like('s3_path', '%' + original_filename + '%').order('id', desc=True).execute()
      supabase_contents = supabase_contents.data
    elif url:
      filename = url
      supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq(
          'course_name', course_name).eq('url', url).order('id', desc=True).execute()
      supabase_contents = supabase_contents.data
    else:
      filename = None
      supabase_contents = []

    if not supabase_contents:
      # filename does not already exist in Supabase, so it's a brand new file
      print(f"NOT a duplicate! 📄s3_path: {filename}")
      return False

    # Only the newest row (highest id) is compared against the incoming text.
    supabase_whole_text = ''.join(context['text'] for context in supabase_contents[0]['contexts'])
    current_whole_text = ''.join(text['input'] for text in texts)

    if supabase_whole_text == current_whole_text:  # matches the previous file
      print(f"Duplicate ingested! 📄 s3_path: {filename}.")
      return True

    # Same name, new contents: remove every older copy.
    print(f"Updated file detected! Same filename, new contents. 📄 s3_path: {filename}")
    for content in supabase_contents:
      print("older s3_path to be deleted: ", content['s3_path'])
      delete_status = self.delete_data(course_name, content['s3_path'], '')
      print("delete_status: ", delete_status)
    return False

`init()`

Initialize AWS S3, Qdrant, and Supabase.

Source code in ai_ta_backend/vector_database.py

def __init__(self):
  """
  Wire up every external client this object uses.

  Sets the OpenAI API key and creates, in order: the Qdrant client and the vector
  store built on it, the S3 client, the Supabase client, the Azure chat LLM, and the
  PostHog client. Settings read with `os.environ[...]` are required (KeyError if
  unset); those read with `os.getenv` may be None.
  """
  openai.api_key = os.getenv("OPENAI_API_KEY")

  # vector DB
  qdrant_url = os.getenv('QDRANT_URL')
  qdrant_api_key = os.getenv('QDRANT_API_KEY')
  self.qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

  collection = os.environ['QDRANT_COLLECTION_NAME']
  embedder = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)
  self.vectorstore = Qdrant(client=self.qdrant_client, collection_name=collection, embeddings=embedder)

  # S3
  aws_key_id = os.getenv('AWS_ACCESS_KEY_ID')
  aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY')
  self.s3_client = boto3.client('s3', aws_access_key_id=aws_key_id, aws_secret_access_key=aws_secret)

  # Supabase
  supabase_url = os.environ['SUPABASE_URL']
  supabase_key = os.environ['SUPABASE_API_KEY']
  self.supabase_client = supabase.create_client(  # type: ignore
      supabase_url=supabase_url, supabase_key=supabase_key)

  # Azure-hosted chat model
  azure_settings = {
      'deployment_name': os.getenv('AZURE_OPENAI_ENGINE'),
      'openai_api_base': os.getenv('AZURE_OPENAI_ENDPOINT'),
      'openai_api_key': os.getenv('AZURE_OPENAI_KEY'),
      'openai_api_version': os.getenv('OPENAI_API_VERSION'),
  }
  self.llm = AzureChatOpenAI(temperature=0, openai_api_type=OPENAI_API_TYPE, **azure_settings)  # type: ignore

  # Analytics
  posthog_key = os.environ['POSTHOG_API_KEY']
  self.posthog = Posthog(sync_mode=True, project_api_key=posthog_key, host='https://app.posthog.com')

`batch_vector_search(search_queries, course_name, top_n=50)`

Perform a similarity search for all the generated queries at once.

Source code in ai_ta_backend/vector_database.py

def batch_vector_search(self, search_queries: List[str], course_name: str, top_n: int = 50):
  """
  Perform a similarity search for all the generated queries at once.

  Each query is embedded separately, then all searches are sent to Qdrant in a single
  batch request restricted to points whose `course_name` payload matches.

  Args:
    search_queries: Query strings to embed and search for.
    course_name: Only points with this `course_name` are searched.
    top_n: Maximum number of hits requested per query.

  Returns:
    A list with one entry per query, each a list of `Document`s built from the hit
    payloads. Hits whose payload cannot be converted (for example, no 'page_content')
    are skipped after their traceback is printed.
  """
  start_time = time.monotonic()

  from qdrant_client.http import models as rest
  o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)
  # Prepare the filter for the course name
  myfilter = rest.Filter(must=[
      rest.FieldCondition(key='course_name', match=rest.MatchValue(value=course_name)),
  ])

  # Prepare the search requests
  search_requests = []
  for query in search_queries:
    user_query_embedding = o.embed_query(query)
    search_requests.append(
        rest.SearchRequest(vector=user_query_embedding,
                           filter=myfilter,
                           limit=top_n,
                           with_payload=True,
                           params=models.SearchParams(quantization=models.QuantizationSearchParams(rescore=False))))

  # Perform the batch search
  search_results = self.qdrant_client.search_batch(
      collection_name=os.environ['QDRANT_COLLECTION_NAME'],
      requests=search_requests,
  )
  # process search results
  found_docs: list[list[Document]] = []
  for result in search_results:
    docs = []
    for doc in result:
      try:
        metadata = doc.payload
        # The text becomes the Document body, so take it out of the metadata.
        page_content = metadata.pop('page_content')

        if "pagenumber" not in metadata and "pagenumber_or_timestamp" in metadata:
          metadata["pagenumber"] = metadata["pagenumber_or_timestamp"]

        docs.append(Document(page_content=page_content, metadata=metadata))
      except Exception:
        # print_exc() writes the traceback itself and returns None; wrapping it in
        # print() only added a stray "None" line to the logs.
        traceback.print_exc()
    found_docs.append(docs)

  print(f"⏰ Qdrant Batch Search runtime: {(time.monotonic() - start_time):.2f} seconds")
  return found_docs

`check_for_duplicates(texts, metadatas)`

For the given metadata, fetch docs from Supabase based on S3 path or URL. If a doc exists, concatenate its texts and compare them with the current texts; if they are the same, return True.

Source code in ai_ta_backend/vector_database.py

def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]]) -> bool:
  """
  Decide whether an incoming document has already been ingested for its course.

  Existing rows are fetched from the Supabase materials table, newest id first:
  by filename (LIKE on `s3_path`) when an S3 path is given, otherwise by exact `url`.
  The concatenated text of the newest row is compared with the concatenated incoming text.

  Args:
    texts: Incoming chunks; each dict's 'input' value is the chunk text.
    metadatas: Per-chunk metadata. Only the first entry is read; it must contain
      'course_name', 's3_path' and 'url'.

  Returns:
    True if the stored text is identical to the incoming text (a duplicate).
    False if no matching row exists, or if the content differs. In the latter case
    every matching row is first passed to `self.delete_data`.
  """
  doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE', '')
  course_name = metadatas[0]['course_name']
  incoming_s3_path = metadatas[0]['s3_path']
  url = metadatas[0]['url']

  # Not all s3_paths carry a uuid. Strip it only when a v4 uuid is the filename's
  # *prefix* (36 uuid chars + 1 separator); a uuid elsewhere in the name must not
  # cause the first 37 characters to be chopped off.
  incoming_filename = incoming_s3_path.split('/')[-1]
  uuid_prefix = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)
  if uuid_prefix.match(incoming_filename):
    # Fall back to the full name if nothing is left, so the LIKE below never becomes '%%'.
    original_filename = incoming_filename[37:] or incoming_filename
  else:
    original_filename = incoming_filename

  if incoming_s3_path:
    filename = incoming_s3_path
    supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq(
        'course_name', course_name).like('s3_path', '%' + original_filename + '%').order('id', desc=True).execute()
    supabase_contents = supabase_contents.data
  elif url:
    filename = url
    supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq(
        'course_name', course_name).eq('url', url).order('id', desc=True).execute()
    supabase_contents = supabase_contents.data
  else:
    filename = None
    supabase_contents = []

  if not supabase_contents:
    # filename does not already exist in Supabase, so it's a brand new file
    print(f"NOT a duplicate! 📄s3_path: {filename}")
    return False

  # Only the newest row (highest id) is compared against the incoming text.
  supabase_whole_text = ''.join(context['text'] for context in supabase_contents[0]['contexts'])
  current_whole_text = ''.join(text['input'] for text in texts)

  if supabase_whole_text == current_whole_text:  # matches the previous file
    print(f"Duplicate ingested! 📄 s3_path: {filename}.")
    return True

  # Same name, new contents: remove every older copy.
  print(f"Updated file detected! Same filename, new contents. 📄 s3_path: {filename}")
  for content in supabase_contents:
    print("older s3_path to be deleted: ", content['s3_path'])
    delete_status = self.delete_data(course_name, content['s3_path'], '')
    print("delete_status: ", delete_status)
  return False

`delete_data(course_name, s3_path, source_url)`

Delete file from S3, Qdrant, and Supabase.

Source code in ai_ta_backend/vector_database.py

def delete_data(self, course_name: str, s3_path: str, source_url: str):
  """Best-effort delete of one document from S3, Qdrant, the Nomic document map, and Supabase.

  The document is identified by `s3_path` when it is non-empty, otherwise by
  `source_url`; if both are empty nothing is deleted. Each store is handled in its own
  try block, so a failure in one is printed and sent to Sentry without stopping the others.

  Args:
    course_name: Course the document belongs to (used for the Supabase lookups and the Nomic map).
    s3_path: S3 key of the document; also triggers the S3 object delete.
    source_url: URL of the document, used only when `s3_path` is empty.

  Returns:
    "Success" once all steps were attempted, or an error string if something
    outside the per-store handlers raised.
  """
  print(f"Deleting {s3_path} from S3, Qdrant, and Supabase for course {course_name}")
  try:
    bucket_name = os.getenv('S3_BUCKET_NAME')

    # Pick the payload/column used to identify the document; s3_path wins over the URL.
    if s3_path:
      match_key, match_value = 's3_path', s3_path
    elif source_url:
      match_key, match_value = 'url', source_url
    else:
      return "Success"

    # Only documents identified by an S3 path have an object to remove from the bucket.
    if match_key == 's3_path':
      try:
        self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path)
      except Exception as s3_error:
        print("Error in deleting file from s3:", s3_error)
        sentry_sdk.capture_exception(s3_error)

    # Qdrant: delete every point whose payload field matches.
    # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key
    try:
      self.qdrant_client.delete(
          collection_name=os.environ['QDRANT_COLLECTION_NAME'],
          points_selector=models.Filter(must=[
              models.FieldCondition(key=match_key, match=models.MatchValue(value=match_value)),
          ]),
      )
    except Exception as qdrant_error:
      # Timed out is fine. Still deletes.
      # https://github.com/qdrant/qdrant/issues/3654#issuecomment-1955074525
      if "timed out" not in str(qdrant_error):
        print("Error in deleting file from Qdrant:", qdrant_error)
        sentry_sdk.capture_exception(qdrant_error)

    # Nomic: ids are "<supabase row id>_<1-based context index>" for the first matching row.
    try:
      response = self.supabase_client.from_(
          os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select(f"id, {match_key}, contexts").eq(
              match_key, match_value).eq('course_name', course_name).execute()
      record = response.data[0]
      nomic_ids_to_delete = [str(record['id']) + "_" + str(position) for position in range(1, len(record['contexts']) + 1)]
      delete_from_document_map(course_name, nomic_ids_to_delete)
    except Exception as nomic_error:
      print("Error in deleting file from Nomic:", nomic_error)
      sentry_sdk.capture_exception(nomic_error)

    # Supabase: remove the matching rows themselves.
    try:
      self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
          match_key, match_value).eq('course_name', course_name).execute()
    except Exception as supabase_error:
      print("Error in deleting file from supabase:", supabase_error)
      sentry_sdk.capture_exception(supabase_error)

    return "Success"
  except Exception as e:
    err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
    print(err)
    sentry_sdk.capture_exception(e)
    return err

`delete_entire_course(course_name)`

Delete entire course.

Delete materials from S3, Supabase SQL, Vercel KV, and the Qdrant vector DB.

Args: `course_name` (str): name of the course whose materials are deleted.

Source code in ai_ta_backend/vector_database.py

def delete_entire_course(self, course_name: str):
  """Delete entire course.

  Removes the course's objects from S3 (every key under `courses/{course_name}/`),
  its points from the Qdrant collection, and its rows from the Supabase materials
  table. Each store is handled in its own try block, so a failure in one is printed
  and sent to Sentry without preventing the others from being attempted.

  Args:
      course_name (str): Name of the course to delete.

  Returns:
      "Success" if the final Supabase delete completes; None if it raises.
  """
  print(f"Deleting entire course: {course_name}")
  try:
    # Delete file from S3
    print("Deleting from S3")
    bucket_name = os.getenv('S3_BUCKET_NAME')
    # A single list call returns at most 1000 keys, so page through the whole prefix.
    # Keys are collected before deleting so the listing is not changed mid-iteration.
    paginator = self.s3_client.get_paginator('list_objects_v2')
    keys_to_delete = [
        s3_object['Key']
        for page in paginator.paginate(Bucket=bucket_name, Prefix=f'courses/{course_name}/')
        for s3_object in page.get('Contents', [])  # 'Contents' is absent when nothing matches
    ]
    for key in keys_to_delete:
      self.s3_client.delete_object(Bucket=bucket_name, Key=key)
  except Exception as e:
    err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
    print(err)
    sentry_sdk.capture_exception(e)

  try:
    # Delete from Qdrant
    # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key
    # Qdrant "points" look like this (abridged):
    #   Record(id='000295ca-...', payload={'metadata': {'course_name': 'zotero-extreme',
    #     'pagenumber_or_timestamp': 15, 'readable_filename': '...pdf', 's3_path': 'courses/zotero-extreme/...pdf'},
    #     'page_content': '...'}, vector=None)
    print("deleting from qdrant")
    self.qdrant_client.delete(
        collection_name=os.environ['QDRANT_COLLECTION_NAME'],
        points_selector=models.Filter(must=[
            models.FieldCondition(
                key="course_name",
                match=models.MatchValue(value=course_name),
            ),
        ]),
    )
  except Exception as e:
    err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
    print(err)
    sentry_sdk.capture_exception(e)

  try:
    # Delete from Supabase
    print("deleting from supabase")
    response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
        'course_name', course_name).execute()
    print("supabase response: ", response)
    return "Success"
  except Exception as e:
    err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
    print(err)
    sentry_sdk.capture_exception(e)

`format_for_json(found_docs)`

Formatting only. {'course_name': course_name, 'contexts': [{'source_name': 'Lumetta_notes', 'source_location': 'pg. 19', 'text': 'In FSM, we do this...'}, {'source_name': 'Lumetta_notes', 'source_location': 'pg. 20', 'text': 'In Assembly language, the code does that...'},]}

Parameters:

Name	Type	Description	Default
`found_docs`	`List[Document]`	description	required

Raises:

Type	Description
`Exception`	description

Returns:

Type	Description
`List[Dict]`	List[Dict]: description

Source code in ai_ta_backend/vector_database.py

def format_for_json(self, found_docs: List[Document]) -> List[Dict]:
  """Flatten retrieved documents into JSON-serializable context dicts.

  Documents whose metadata has no 'pagenumber' key get one copied, in place, from
  'pagenumber_or_timestamp' before the output is built.

  Args:
      found_docs (List[Document]): Objects exposing `.page_content` and a `.metadata` dict.

  Raises:
      KeyError: If 'readable_filename', 'course_name' or 's3_path' is missing from a
        document's metadata, or if both 'pagenumber' and 'pagenumber_or_timestamp' are.

  Returns:
      List[Dict]: One dict per document with the keys 'text', 'readable_filename',
        'course_name ' (sic, with a trailing space), 's3_path', 'pagenumber', 'url'
        and 'base_url'. The last two are None when absent from the metadata.
  """
  # First pass: backfill 'pagenumber' on every document before any output is built.
  for candidate in found_docs:
    meta = candidate.metadata
    if "pagenumber" in meta:
      continue
    print("found no pagenumber")
    meta['pagenumber'] = meta['pagenumber_or_timestamp']

  # Second pass: project each document onto the flat output shape.
  formatted: List[Dict] = []
  for candidate in found_docs:
    meta = candidate.metadata
    formatted.append({
        'text': candidate.page_content,
        'readable_filename': meta['readable_filename'],
        'course_name ': meta['course_name'],
        's3_path': meta['s3_path'],
        'pagenumber': meta['pagenumber'],
        # Optional fields: .get() yields None instead of raising.
        'url': meta.get('url'),
        'base_url': meta.get('base_url'),
    })
  return formatted

`format_for_json_mqr(found_docs)`

Same as format_for_json, but for the new MQR pipeline.

Source code in ai_ta_backend/vector_database.py

def format_for_json_mqr(self, found_docs) -> List[Dict]:
  """
  Same as format_for_json, but for the new MQR pipeline, where each found doc is a
  plain dict rather than an object with `.metadata`.

  Docs without a 'pagenumber' key get one copied, in place, from 'pagenumber_or_timestamp'.

  Args:
    found_docs: Iterable of dicts with 'text', 'readable_filename', 'course_name',
      's3_path' and 'pagenumber' (or 'pagenumber_or_timestamp'); 'url' and 'base_url'
      are optional.

  Returns:
    One dict per doc with the keys 'text', 'readable_filename', 'course_name ' (sic,
    with a trailing space), 's3_path', 'pagenumber', 'url' and 'base_url'.

  Raises:
    KeyError: If a required key is missing from a doc.
  """
  for found_doc in found_docs:
    if "pagenumber" not in found_doc:
      print("found no pagenumber")
      found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp']

  contexts = [
      {
          'text': doc['text'],
          'readable_filename': doc['readable_filename'],
          'course_name ': doc['course_name'],
          's3_path': doc['s3_path'],
          'pagenumber': doc['pagenumber'],
          # Optional fields: use .get() like format_for_json does, so docs without a
          # URL yield None instead of raising KeyError.
          'url': doc.get('url'),
          'base_url': doc.get('base_url'),
      } for doc in found_docs
  ]

  return contexts

`getAll(course_name)`

Get all course materials based on course name.

Args: `course_name` — the course name as uploaded on Supabase.

Returns: a list of dictionaries, one per distinct combination of `s3_path`, `readable_filename`, `course_name`, `url`, and `base_url`.

Source code in ai_ta_backend/vector_database.py

def getAll(self, course_name: str):
  """List a course's materials, with exact duplicates removed.

  Args:
      course_name: Course name as stored in the Supabase materials table.
  Returns:
      The rows fetched for that course (each with 'course_name', 's3_path',
      'readable_filename', 'url', 'base_url'), keeping only the first row for every
      distinct combination of those five values, in the order they were returned.
  """
  table_name = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']
  query = self.supabase_client.table(table_name).select('course_name, s3_path, readable_filename, url, base_url')
  rows = query.eq('course_name', course_name).execute().data

  # Dicts keep insertion order and setdefault never overwrites, so the first row seen
  # for each combination is the one retained.
  first_by_combination = {}
  for row in rows:
    fingerprint = (row['s3_path'], row['readable_filename'], row['course_name'], row['url'], row['base_url'])
    first_by_combination.setdefault(fingerprint, row)

  return list(first_by_combination.values())

`getTopContexts(search_query, course_name, token_limit=4000)`

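Retrieve the top matching contexts for a search query, keeping only as many as fit within a token budget.

Arguments: `search_query` (str), the user's query; `course_name` (str), passed to the vector search; `token_limit` (int, default 4000), the maximum number of prompt tokens.

Returns JSON: a list of context dictionaries (see main.py:getTopContexts docs), or an empty list when no context fits. On failure, returns a String: an error message with traceback.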

Source code in ai_ta_backend/vector_database.py

def getTopContexts(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]:
  """Return the retrieved contexts that fit in the token budget, formatted for JSON.

  Runs `self.vector_search`, then walks the results in order, counting the tokens of
  each document on top of the fixed prompt text plus the query. Selection stops at the
  first document that would push the running total past `token_limit`.

  Args:
    search_query: The user's query.
    course_name: Passed through to the vector search.
    token_limit: Maximum total tokens for prompt text, query and selected contexts.

  Returns:
    The selected documents as produced by `self.format_for_json`, an empty list if
    none were selected, or an error string with traceback if anything raised.
  """
  try:
    start_time_overall = time.monotonic()

    found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name)

    pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
    # The budget starts with the fixed prompt text plus the query; contexts are added on top.
    prompt_frame = pre_prompt + '\n\nNow please respond to my query: ' + search_query
    token_counter, _ = count_tokens_and_cost(prompt_frame)  # type: ignore

    selected = []
    for candidate in found_docs:
      meta = candidate.metadata
      source_name = meta['readable_filename']
      page_suffix = ', page: ' + str(meta['pagenumber']) if meta['pagenumber'] else ''
      doc_string = f"Document: {source_name}{page_suffix}\n{str(candidate.page_content)}\n"
      num_tokens, prompt_cost = count_tokens_and_cost(doc_string)  # type: ignore

      print(
          f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, total prompt cost (of these contexts): {prompt_cost}. 📄 File: {meta['readable_filename']}"
      )
      if token_counter + num_tokens > token_limit:
        # filled our token size, time to return
        break
      token_counter += num_tokens
      selected.append(candidate)

    print(f"Total tokens used: {token_counter}. Docs used: {len(selected)} of {len(found_docs)} docs retrieved")
    print(f"Course: {course_name} ||| search_query: {search_query}")
    print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds")
    if not selected:
      return []

    usage_properties = {
        'user_query': search_query,
        'course_name': course_name,
        'token_limit': token_limit,
        'total_tokens_used': token_counter,
        'total_contexts_used': len(selected),
        'total_unique_docs_retrieved': len(found_docs),
        'getTopContext_total_latency_sec': time.monotonic() - start_time_overall,
    }
    self.posthog.capture('distinct_id_of_the_user', event='success_get_top_contexts_OG', properties=usage_properties)

    return self.format_for_json(selected)
  except Exception as e:
    # return full traceback to front end
    err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}"  # type: ignore
    print(err)
    sentry_sdk.capture_exception(e)
    return err

`getTopContextsWithMQR(search_query, course_name, token_limit=4000)`

New info-retrieval pipeline that uses multi-query retrieval + filtering + reciprocal rank fusion + context padding:

1. Generate multiple queries based on the input search query.
2. Retrieve relevant docs for each query.
3. Filter the relevant docs based on the user query and pass them to the rank fusion step.
4. [CANCELED because pointless] Rank the docs based on the relevance score.
5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document.

Note: the source shown below currently returns `'fail'` without running any of these steps.

Source code in ai_ta_backend/vector_database.py

def getTopContextsWithMQR(self,
                          search_query: str,
                          course_name: str,
                          token_limit: int = 4_000) -> Union[List[Dict], str]:
  """
  Placeholder for the multi-query-retrieval (MQR) variant of context lookup.

  The pipeline this entry point was designed for:
  1. Generate multiple queries based on the input search query.
  2. Retrieve relevant docs for each query.
  3. Filter the relevant docs based on the user query and pass them to the rank fusion step.
  4. [Canceled because pointless] Rank the docs based on the relevance score.
  5. Parent-doc-retrieval: pad just the top 5 docs with expanded context from the original document.

  None of those steps are executed here: every argument is ignored and the
  constant string 'fail' is returned.

  Args:
      search_query (str): The user's query (unused).
      course_name (str): The course to search in (unused).
      token_limit (int): Token budget for the returned contexts (unused).

  Returns:
      str: Always 'fail'.
  """
  outcome = 'fail'
  return outcome

`get_context_stuffed_prompt(user_question, course_name, top_n, top_k_to_search)`

Get a stuffed prompt for a given user question and course name. Args: user_question (str) course_name (str) : used for metadata filtering Returns : str a very long "stuffed prompt" with question + summaries of top_n most relevant documents.

Source code in ai_ta_backend/vector_database.py

  def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n: int, top_k_to_search: int) -> str:
    """
    Get a stuffed prompt for a given user question and course name.
    Args:
      user_question (str)
      course_name (str) : used for metadata filtering
    Returns : str
      a very long "stuffed prompt" with question + summaries of top_n most relevant documents.
    """
    # MMR with metadata filtering based on course_name
    vec_start_time = time.monotonic()
    found_docs = self.vectorstore.max_marginal_relevance_search(user_question, k=top_n, fetch_k=top_k_to_search)
    print(
        f"⏰ MMR Search runtime (top_n_to_keep: {top_n}, top_k_to_search: {top_k_to_search}): {(time.monotonic() - vec_start_time):.2f} seconds"
    )

    requests = []
    for doc in found_docs:
      print("doc", doc)
      dictionary = {
          "model": "gpt-3.5-turbo",
          "messages": [{
              "role":
                  "system",
              "content":
                  "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary."
          }, {
              "role":
                  "user",
              "content":
                  f"Provide a comprehensive summary of the given text, based on this question:\n{doc.page_content}\nQuestion: {user_question}\nThe summary should cover all the key points that are relevant to the question, while also condensing the information into a concise format. The length of the summary should be as short as possible, without losing relevant information.\nMake use of direct quotes from the text.\nFeel free to include references, sentence fragments, keywords or anything that could help someone learn about it, only as it relates to the given question.\nIf the text does not provide information to answer the question, please write 'None' and nothing else.",
          }],
          "n": 1,
          "max_tokens": 600,
          "metadata": doc.metadata
      }
      requests.append(dictionary)

    oai = OpenAIAPIProcessor(
        input_prompts_list=requests,
        request_url='https://api.openai.com/v1/chat/completions',
        api_key=os.getenv("OPENAI_API_KEY"),
        max_requests_per_minute=1500,
        max_tokens_per_minute=90000,
        token_encoding_name='cl100k_base',  # nosec -- reasonable bandit error suppression
        max_attempts=5,
        logging_level=20)

    chain_start_time = time.monotonic()
    asyncio.run(oai.process_api_requests_from_file())
    results: list[str] = oai.results
    print(f"⏰ EXTREME context stuffing runtime: {(time.monotonic() - chain_start_time):.2f} seconds")

    print(f"Cleaned results: {oai.cleaned_results}")

    all_texts = ""
    separator = '---'  # between each context
    token_counter = 0  #keeps track of tokens in each summarization
    max_tokens = 7_500  #limit, will keep adding text to string until 8000 tokens reached.
    for i, text in enumerate(oai.cleaned_results):
      if text.lower().startswith('none') or text.lower().endswith('none.') or text.lower().endswith('none'):
        # no useful text, it replied with a summary of "None"
        continue
      if text is not None:
        if "pagenumber" not in results[i][-1].keys():  # type: ignore
          results[i][-1]['pagenumber'] = results[i][-1].get('pagenumber_or_timestamp')  # type: ignore
        num_tokens, prompt_cost = count_tokens_and_cost(text)  # type: ignore
        if token_counter + num_tokens > max_tokens:
          print(f"Total tokens yet in loop {i} is {num_tokens}")
          break  # Stop building the string if it exceeds the maximum number of tokens
        token_counter += num_tokens
        filename = str(results[i][-1].get('readable_filename', ''))  # type: ignore
        pagenumber_or_timestamp = str(results[i][-1].get('pagenumber', ''))  # type: ignore
        pagenumber = f", page: {pagenumber_or_timestamp}" if pagenumber_or_timestamp else ''
        doc = f"Document : filename: {filename}" + pagenumber
        summary = f"\nSummary: {text}"
        all_texts += doc + summary + '\n' + separator + '\n'

    stuffed_prompt = """Please answer the following question.
Use the context below, called 'your documents', only if it's helpful and don't use parts that are very irrelevant.
It's good to quote 'your documents' directly using informal citations, like "in document X it says Y". Try to avoid giving false or misleading information. Feel free to say you don't know.
Try to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable.
That said, be practical and really do your best, and don't let caution get too much in the way of being useful.
To help answer the question, here's a few passages of high quality documents:\n{all_texts}
Now please respond to my question: {user_question}"""

    # "Please answer the following question. It's good to quote 'your documents' directly, something like 'from ABS source it says XYZ' Feel free to say you don't know. \nHere's a few passages of the high quality 'your documents':\n"

    return stuffed_prompt

`get_stuffed_prompt(search_query, course_name, token_limit=7000)`

Returns String: A fully formatted prompt string.

Source code in ai_ta_backend/vector_database.py

def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: int = 7_000) -> str:
  """
  Build a prompt containing the raw text of the most similar course documents, followed by the query.

  The query is embedded, the Qdrant collection is searched (restricted to `course_name`), and the
  hits are appended in the order returned until adding another one would exceed `token_limit`.

  Args:
      search_query (str): The user's query.
      course_name (str): Only documents whose `course_name` payload field matches are searched.
      token_limit (int): Maximum number of tokens for the instructions, query and document text combined.

  Returns:
      str: The fully formatted prompt. If the search returns nothing, `search_query` is returned
      unchanged. If anything raises, the error message (with traceback) is returned instead.
  """
  try:
    top_n = 90
    start_time_overall = time.monotonic()
    o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)
    # embed_documents expects a list of texts; passing the bare string would
    # treat every character as its own document and [0] would be the
    # embedding of the first character only.
    user_query_embedding = o.embed_documents([search_query])[0]  # type: ignore
    myfilter = models.Filter(must=[
        models.FieldCondition(key='course_name', match=models.MatchValue(value=course_name)),
    ])

    found_docs = self.qdrant_client.search(
        collection_name=os.environ['QDRANT_COLLECTION_NAME'],
        query_filter=myfilter,
        with_vectors=False,
        query_vector=user_query_embedding,
        limit=top_n  # Return the top_n closest points
    )
    print("Search results: ", found_docs)
    if len(found_docs) == 0:
      return search_query

    pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"

    # count tokens at start and end, then also count each context.
    token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' +
                                             search_query)  # type: ignore
    valid_docs = []
    for d in found_docs:
      if d.payload is None:
        continue
      # Fall back to 'pagenumber_or_timestamp' when 'pagenumber' is absent.
      # .get() keeps one payload that lacks both keys from aborting the whole prompt.
      if "pagenumber" not in d.payload.keys():
        d.payload["pagenumber"] = d.payload.get("pagenumber_or_timestamp")

      filename = d.payload.get('readable_filename', '')
      pagenumber = d.payload['pagenumber']
      page_suffix = f", page: {pagenumber}" if pagenumber else ''
      doc_string = f"---\nDocument: {filename}{page_suffix}\n{d.payload.get('page_content')}\n"
      num_tokens, prompt_cost = count_tokens_and_cost(doc_string)  # type: ignore

      print(
          f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, prompt cost of chunk: {prompt_cost}. 📄 File: {filename}"
      )
      # Documents that do not fit are skipped (not a hard stop), so a later, smaller one may still be added.
      if token_counter + num_tokens <= token_limit:
        token_counter += num_tokens
        valid_docs.append(
            Document(page_content=d.payload.get('page_content', '<Missing page content>'), metadata=d.payload))

    # Convert the valid_docs to full prompt
    separator = '---\n'  # between each context
    context_text = separator.join(
        f"Document: {d.metadata.get('readable_filename', '')}{', page: ' + str(d.metadata.get('pagenumber')) if d.metadata.get('pagenumber') else ''}\n{d.page_content}\n"
        for d in valid_docs)

    # Create the stuffedPrompt
    stuffedPrompt = (pre_prompt + context_text + '\n\nNow please respond to my query: ' + search_query)

    TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4')  # type: ignore
    print(f"Total tokens: {TOTAL_num_tokens}, prompt_cost: {prompt_cost}")
    print("total docs: ", len(found_docs))
    print("num docs used: ", len(valid_docs))

    print(f"⏰ ^^ Runtime of get_stuffed_prompt: {(time.monotonic() - start_time_overall):.2f} seconds")
    return stuffedPrompt
  except Exception as e:
    # return full traceback to front end
    err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
    print(err)
    sentry_sdk.capture_exception(e)
    return err

`ingest_coursera(coursera_course_name, course_name)`

Download all the files from a coursera course and ingest them.

1. Download the Coursera content.
2. Upload it to S3 (so users can view it).
3. Run everything through the bulk ingest method.

Parameters:

Name	Type	Description	Default
`coursera_course_name`	`str`	The name of the coursera course.	required
`course_name`	`str`	The name of the course in our system.	required

Returns:

Name	Type	Description
`_type_`	`str`	Success or error message.

Source code in ai_ta_backend/vector_database.py

def ingest_coursera(self, coursera_course_name: str, course_name: str) -> str:
  """ Download all the files from a coursera course and ingest them.

  1. Download the coursera content with the `coursera-dl` command line tool.
  2. Upload to S3 (so users can view it)
  3. Run everything through the bulk_ingest method.

  The Coursera login is read from the environment variables COURSERA_USERNAME,
  COURSERA_PASSWORD and COURSERA_CAUTH (the value passed to `-ca`). They must be
  set in the deployment; credentials are deliberately not stored in source.

  Args:
      coursera_course_name (str): The name of the coursera course. Must be a plain
        name: no path separators and no leading '-'.
      course_name (str): The name of the course in our system.

  Returns:
      str: "Success", or an error message describing what went wrong.
  """
  # The course name becomes both a command-line argument and a directory name
  # that is deleted afterwards, so refuse anything that could be parsed as an
  # option or could point outside the download directory.
  if (not coursera_course_name or coursera_course_name in ('.', '..') or coursera_course_name.startswith('-') or
      os.path.basename(coursera_course_name) != coursera_course_name):
    return f"Error: Invalid coursera course name: `{coursera_course_name}`"

  username = os.getenv('COURSERA_USERNAME')
  password = os.getenv('COURSERA_PASSWORD')
  cauth = os.getenv('COURSERA_CAUTH')
  if not (username and password and cauth):
    return ("Error: Coursera credentials are not configured. "
            "Set COURSERA_USERNAME, COURSERA_PASSWORD and COURSERA_CAUTH.")

  # Argument list + no shell: the course name can never be interpreted as shell syntax.
  command = [
      "coursera-dl",
      "-u", username,
      "-p", password,
      "--ignore-formats", "mp4",
      "--subtitle-language", "en",
      "--path", "./coursera-dl",
      "-ca", cauth,
      coursera_course_name,
  ]
  dl_results_path = os.path.join('coursera-dl', coursera_course_name)

  try:
    subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    s3_paths: Union[List, None] = upload_data_files_to_s3(course_name, dl_results_path)

    if s3_paths is None:
      return "Error: No files found in the coursera-dl directory"

    print("starting bulk ingest")
    start_time = time.monotonic()
    self.bulk_ingest(s3_paths, course_name)
    print("completed bulk ingest")
    print(f"⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds")

    return "Success"
  except Exception as e:
    err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
    print(err)
    sentry_sdk.capture_exception(e)
    return err
  finally:
    # Cleanup the coursera downloads on every path (success, "no files", or failure).
    shutil.rmtree(dl_results_path, ignore_errors=True)

`ingest_github(github_url, course_name)`

Clones the given GitHub URL and uses Langchain to load data. 1. Clone the repo 2. Use Langchain to load the data 3. Pass to split_and_upload() Args: github_url (str): The Github Repo URL to be ingested. course_name (str): The name of the course in our system.

Returns:

Name	Type	Description
`_type_`	`str`	Success or error message.

Source code in ai_ta_backend/vector_database.py

def ingest_github(self, github_url: str, course_name: str) -> str:
  """
  Clones the given GitHub URL and uses Langchain to load data.
  1. Shallow-clone the repo (depth 1, no submodules) into a local scratch directory.
  2. Use Langchain's GitLoader to load the files of the checked-out branch.
  3. Pass each loaded file to split_and_upload().

  Args:
      github_url (str): The Github Repo URL to be ingested.
      course_name (str): The name of the course in our system.

  Returns:
      str: "Success", or an error message (including the traceback) if anything raised.
  """
  repo_path = "media/cloned_repo"
  try:
    try:
      repo = Repo.clone_from(github_url, to_path=repo_path, depth=1, clone_submodules=False)
      branch = str(repo.head.reference)

      loader = GitLoader(repo_path=repo_path, branch=branch)
      data = loader.load()
    finally:
      # Always remove the clone, also when cloning/loading failed: a leftover
      # directory would make the next clone into the same path fail.
      shutil.rmtree(repo_path, ignore_errors=True)

    # create metadata for each file in data
    for doc in data:
      texts = doc.page_content
      metadatas: Dict[str, Any] = {
          'course_name': course_name,
          's3_path': '',
          'readable_filename': doc.metadata['file_name'],
          # Link to the branch that was actually cloned; not every repo's default branch is "main".
          'url': f"{github_url}/blob/{branch}/{doc.metadata['file_path']}",
          'pagenumber': '',
          'timestamp': '',
      }
      self.split_and_upload(texts=[texts], metadatas=[metadatas])
    return "Success"
  except Exception as e:
    err = f"❌❌ Error in (GITHUB ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"
    print(err)
    sentry_sdk.capture_exception(e)
    return err

`ingest_single_web_text(course_name, base_url, url, content, title)`

Crawlee integration

Source code in ai_ta_backend/vector_database.py

def ingest_single_web_text(self, course_name: str, base_url: str, url: str, content: str, title: str) -> str:
  """Ingest the text of a single web page (Crawlee integration).

  Wraps `content` with web-page metadata and hands it to split_and_upload().
  An analytics event is sent when the call starts and another after
  split_and_upload() returns.

  Args:
      course_name (str): The name of the course in our system.
      base_url (str): Stored in the metadata as `base_url`.
      url (str): The URL of this page, stored in the metadata as `url`.
      content (str): The page text to ingest.
      title (str): Stored in the metadata as `readable_filename`.

  Returns:
      str: "Success", or an error message (including the traceback) if anything raised.
  """
  self.posthog.capture('distinct_id_of_the_user',
                       event='ingest_single_web_text_invoked',
                       properties={
                           'course_name': course_name,
                           'base_url': base_url,
                           'url': url,
                           'content': content,
                           'title': title
                       })
  try:
    text = [content]
    metadatas: List[Dict[str, Any]] = [{
        'course_name': course_name,
        's3_path': '',
        'readable_filename': title,
        'pagenumber': '',
        'timestamp': '',
        'url': url,
        'base_url': base_url,
    }]
    self.split_and_upload(texts=text, metadatas=metadatas)
    self.posthog.capture('distinct_id_of_the_user',
                         event='ingest_single_web_text_succeeded',
                         properties={
                             'course_name': course_name,
                             'base_url': base_url,
                             'url': url,
                             'title': title
                         })

    return "Success"
  except Exception as e:
    # Build ONE string. A stray comma here previously produced a
    # (message, traceback) tuple, which was then returned to the caller.
    err = f"❌❌ Error in (web text ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"  # type: ignore
    print(err)
    sentry_sdk.capture_exception(e)
    return err

`reciprocal_rank_fusion(results, k=60)`

Since we have multiple queries, and n documents returned per query, we need to go through all the results and collect the documents with the highest overall score, as scored by qdrant similarity matching.

Source code in ai_ta_backend/vector_database.py

def reciprocal_rank_fusion(self, results: list[list], k=60):
  """
  Merge several ranked result lists into one list ordered by fused score.

  Each inner list of `results` holds the documents returned for one query, and is
  assumed to already be sorted from most to least relevant. A document at
  zero-based position `rank` in a list contributes `1 / (rank + k)`; the
  contributions of the same document (identified by its serialized form) are
  summed across all lists.

  Args:
      results (list[list]): One ranked list of documents per query.
      k: Constant added to the rank in the denominator (default 60).

  Returns:
      list[tuple]: `(document, fused_score)` pairs, highest score first.
  """
  scores_by_doc = {}
  total_docs = 0
  for ranked_docs in results:
    total_docs += len(ranked_docs)
    for position, document in enumerate(ranked_docs):
      # The serialized document is the dictionary key, so identical documents
      # coming from different queries accumulate into the same entry.
      key = dumps(document)
      scores_by_doc[key] = scores_by_doc.get(key, 0) + 1 / (position + k)

  print(f"Total number of documents in rank fusion: {total_docs}")
  print(f"Total number of unique documents in rank fusion: {len(scores_by_doc)}")

  ordered = sorted(scores_by_doc.items(), key=lambda pair: pair[1], reverse=True)
  return [(loads(key), score) for key, score in ordered]

`split_and_upload(texts, metadatas)`

This is usually the last step of document ingest: chunk the text, embed the chunks, and upload them to Qdrant and Supabase. Takes in text and metadata (from Langchain doc loaders), splits the text into chunks, and uploads the results.

good examples here: https://langchain.readthedocs.io/en/latest/modules/utils/combine_docs_examples/textsplitter.html

Parameters:

Name	Type	Description	Default
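`texts`	`List[str]`	The raw text strings to split into chunks and upload.	required
`metadatas`	`List[Dict[str, Any]]`	One metadata dict per entry in `texts` (the two lists must have the same length).	required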

Source code in ai_ta_backend/vector_database.py

def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
  """ This is usually the last step of document ingest: chunk, embed and upload.

  Takes in text and metadata (from Langchain doc loaders), splits the text into
  token-based chunks, embeds every chunk with OpenAI, then uploads the chunks
  to Qdrant and inserts one document row into Supabase.

  good examples here: https://langchain.readthedocs.io/en/latest/modules/utils/combine_docs_examples/textsplitter.html

  Args:
      texts (List[str]): The raw text strings to split and upload.
      metadatas (List[Dict[str, Any]]): One metadata dict per entry in `texts`.

  Returns:
      str: "Success" (also when the content was detected as a duplicate and
      nothing was uploaded), or an error message if the input was empty or
      anything raised during processing.

  Raises:
      AssertionError: If `texts` and `metadatas` differ in length.
  """
  # Analytics properties shared by every event below. They describe the first
  # metadata dict; the `else {}` keeps empty input from raising IndexError here.
  first_metadata = metadatas[0] if metadatas else {}
  event_properties = {
      'course_name': first_metadata.get('course_name', None),
      's3_path': first_metadata.get('s3_path', None),
      'readable_filename': first_metadata.get('readable_filename', None),
      'url': first_metadata.get('url', None),
      'base_url': first_metadata.get('base_url', None),
  }
  self.posthog.capture('distinct_id_of_the_user', event='split_and_upload_invoked', properties=dict(event_properties))

  print("In split and upload")
  print(f"metadatas: {metadatas}")
  print(f"Texts: {texts}")
  assert len(texts) == len(
      metadatas
  ), f'must have equal number of text strings and metadata dicts. len(texts) is {len(texts)}. len(metadatas) is {len(metadatas)}'

  if not texts:
    # Nothing to chunk or embed; report it like every other failure in this method.
    err = "ERROR IN split_and_upload(): no texts were provided, nothing to ingest."
    print(err)
    return err

  try:
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000,
        chunk_overlap=150,
        separators=[
            "\n\n", "\n", ". ", " ", ""
        ]  # try to split on paragraphs... fallback to sentences, then chars, ensure we always fit in context window
    )
    contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas)
    input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts]

    # check for duplicates
    is_duplicate = self.check_for_duplicates(input_texts, metadatas)
    if is_duplicate:
      self.posthog.capture('distinct_id_of_the_user',
                           event='split_and_upload_succeeded',
                           properties={
                               **event_properties,
                               'is_duplicate': True,
                           })
      return "Success"

    # adding chunk index to metadata for parent doc retrieval
    for i, context in enumerate(contexts):
      context.metadata['chunk_index'] = i

    oai = OpenAIAPIProcessor(
        input_prompts_list=input_texts,
        request_url='https://api.openai.com/v1/embeddings',
        api_key=os.getenv('VLADS_OPENAI_KEY'),
        # request_url='https://uiuc-chat-canada-east.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-05-15',
        # api_key=os.getenv('AZURE_OPENAI_KEY'),
        max_requests_per_minute=5_000,
        max_tokens_per_minute=300_000,
        max_attempts=20,
        logging_level=logging.INFO,
        token_encoding_name='cl100k_base')  # nosec -- reasonable bandit error suppression
    asyncio.run(oai.process_api_requests_from_file())
    # parse results into dict of shape page_content -> embedding
    embeddings_dict: dict[str, List[float]] = {
        item[0]['input']: item[1]['data'][0]['embedding'] for item in oai.results
    }

    ### BULK upload to Qdrant ###
    vectors: list[PointStruct] = []
    for context in contexts:
      # Every metadata key is stored at the top level of the payload (payload.course_name,
      # not payload.metadata.course_name), which is great for creating indexes.
      upload_metadata = {**context.metadata, "page_content": context.page_content}
      vectors.append(
          PointStruct(id=str(uuid.uuid4()), vector=embeddings_dict[context.page_content], payload=upload_metadata))

    self.qdrant_client.upsert(
        collection_name=os.environ['QDRANT_COLLECTION_NAME'],  # type: ignore
        points=vectors  # type: ignore
    )
    ### Supabase SQL ###
    contexts_for_supa = [{
        "text": context.page_content,
        "pagenumber": context.metadata.get('pagenumber'),
        "timestamp": context.metadata.get('timestamp'),
        "chunk_index": context.metadata.get('chunk_index'),
        "embedding": embeddings_dict[context.page_content]
    } for context in contexts]

    # One row per call; the document-level fields come from the first chunk's metadata.
    document = {
        "course_name": contexts[0].metadata.get('course_name'),
        "s3_path": contexts[0].metadata.get('s3_path'),
        "readable_filename": contexts[0].metadata.get('readable_filename'),
        "url": contexts[0].metadata.get('url'),
        "base_url": contexts[0].metadata.get('base_url'),
        "contexts": contexts_for_supa,
    }

    response = self.supabase_client.table(
        os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute()  # type: ignore

    # add to Nomic document map
    if len(response.data) > 0:
      inserted_data = response.data[0]
      log_to_document_map(inserted_data)

    self.posthog.capture('distinct_id_of_the_user',
                         event='split_and_upload_succeeded',
                         properties=dict(event_properties))
    print("successful END OF split_and_upload")
    return "Success"
  except Exception as e:
    err: str = f"ERROR IN split_and_upload(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
    print(err)
    sentry_sdk.capture_exception(e)
    return err

AWS endpoints

`upload_data_files_to_s3(course_name, localdir)`

Uploads all files in localdir to S3 bucket.

Parameters:

Name	Type	Description	Default
`course_name`	`str`	Official course name on our website.	required
`localdir`	`str`	Local directory to upload from, coursera-dl downloads to this directory.	required

Returns:

Type	Description
`Optional[List[str]]`	Optional[List[str]]: A list of S3 paths, the final resting place of uploads, or None if no files were uploaded.

Source code in ai_ta_backend/aws.py

def upload_data_files_to_s3(course_name: str, localdir: str) -> Optional[List[str]]:
  """Uploads all files in localdir (recursively) to the S3 bucket.

  Every file is stored under `courses/<course_name>/<uuid4>-<original filename>`
  in the bucket named by the S3_BUCKET_NAME environment variable.

  Args:
    course_name (str): Official course name on our website.
    localdir (str): Local directory to upload from, coursera-dl downloads to this directory.

  Returns:
    Optional[List[str]]: A list of S3 paths, the final resting place of uploads, in the same
    order the files were discovered, or None if no files were found.
  """
  filenames = []
  for root, _subdirs, files in os.walk(localdir):
    filenames.extend(os.path.join(root, filename) for filename in files)

  if not filenames:
    print(f"No files to upload. Not found in: {localdir}")
    return None

  print(f"Files to upload: {filenames}")
  print("About to upload...")

  # Create the client only once we know there is something to upload.
  s3 = boto3.client(
      's3',
      aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
      aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
  )

  def upload(myfile: str) -> str:
    # prefix the file name with a unique ID so equally named files never collide
    s3_file = f"courses/{course_name}/{uuid.uuid4()}-{os.path.basename(myfile)}"
    s3.upload_file(myfile, os.getenv('S3_BUCKET_NAME'), s3_file)
    return s3_file

  # Worker count: one per file, capped at cpu_count(), but never fewer than min_p.
  # NOTE(review): the original comment said parallelism was limited to 2 because of
  # 503 rate limiting, yet min_p is only the lower bound here -- confirm the intended cap.
  min_p = 2
  max_p = cpu_count()
  num_procs = max(min(len(filenames), max_p), min_p)
  # The context manager releases the worker threads once map() has returned, and
  # collecting map()'s return values keeps the result order deterministic.
  with ThreadPool(processes=num_procs) as pool:
    s3_paths = pool.map(upload, filenames)

  print("All data files uploaded to S3 successfully.")
  return s3_paths

API Reference

Top Level API Reference

add_canvas_users()

delete()

getAll()

getContextStuffedPrompt()

getTopContexts()

GET arguments

Returns

Raises

getTopContextsWithMQR()

get_stuffed_prompt()

GET arguments

Returns

index()

ingest()

ingest_canvas()

ingest_web_text()

mit_download_course()

resource_report()

https://manpages.debian.org/bookworm/manpages-dev/getrlimit.2.en.html

Backend endpoints

Database endpoints (Supabase, QDrant)

Ingest

__init__()

batch_vector_search(search_queries, course_name, top_n=50)

check_for_duplicates(texts, metadatas)

delete_data(course_name, s3_path, source_url)

delete_entire_course(course_name)

format_for_json(found_docs)

format_for_json_mqr(found_docs)

getAll(course_name)

getTopContexts(search_query, course_name, token_limit=4000)

getTopContextsWithMQR(search_query, course_name, token_limit=4000)

get_context_stuffed_prompt(user_question, course_name, top_n, top_k_to_search)

get_stuffed_prompt(search_query, course_name, token_limit=7000)

ingest_coursera(coursera_course_name, course_name)

ingest_github(github_url, course_name)

ingest_single_web_text(course_name, base_url, url, content, title)

reciprocal_rank_fusion(results, k=60)

split_and_upload(texts, metadatas)

AWS endpoints

upload_data_files_to_s3(course_name, localdir)

`add_canvas_users()`

`delete()`

`getAll()`

`getContextStuffedPrompt()`

`getTopContexts()`

`getTopContextsWithMQR()`

`get_stuffed_prompt()`

`index()`

`ingest()`

`ingest_canvas()`

`ingest_web_text()`

`mit_download_course()`

`resource_report()`

`Ingest`

`__init__()`

`batch_vector_search(search_queries, course_name, top_n=50)`

`check_for_duplicates(texts, metadatas)`

`delete_data(course_name, s3_path, source_url)`

`delete_entire_course(course_name)`

`format_for_json(found_docs)`

`format_for_json_mqr(found_docs)`

`getAll(course_name)`

`getTopContexts(search_query, course_name, token_limit=4000)`

`getTopContextsWithMQR(search_query, course_name, token_limit=4000)`

`get_context_stuffed_prompt(user_question, course_name, top_n, top_k_to_search)`

`get_stuffed_prompt(search_query, course_name, token_limit=7000)`

`ingest_coursera(coursera_course_name, course_name)`

`ingest_github(github_url, course_name)`

`ingest_single_web_text(course_name, base_url, url, content, title)`

`reciprocal_rank_fusion(results, k=60)`

`split_and_upload(texts, metadatas)`

`upload_data_files_to_s3(course_name, localdir)`