vdb.py
import requests
from bs4 import BeautifulSoup
import pinecone
from langchain_community.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders.merge import MergedDataLoader
from consts import PINECONE_API_KEY, PINECONE_CLOUD, PINECONE_INDEX_NAME, PINECONE_DIMENSION, PINECONE_METRICS, CHUNK_OVERLAP, CHUNK_SIZE
def get_content_from_webpage(url):
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        content = soup.select('div#content').pop().get_text(separator='\n', strip=True)
        return content
    except Exception as e:
        print(f"Exception occurred while trying to fetch webpage content: {e}")
        # Return an empty string so callers can safely concatenate the result.
        return ""
def get_webpages_content():
    webpages = [
        'https://www.hackerearth.com/recruit/tech-recruiters/',
        'https://www.hackerearth.com/recruit/hiring-managers/',
        'https://www.hackerearth.com/recruit/university-hiring/',
        'https://www.hackerearth.com/recruit/remote-hiring/',
        'https://www.hackerearth.com/recruit/learning-and-development/',
        'https://www.hackerearth.com/recruit/#/'
    ]
    documents = ""
    for webpage in webpages:
        documents += get_content_from_webpage(webpage)
    return documents
def read_doc(directory="doc/"):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load_and_split()
    return documents
def get_vector_search_index(chunks):
    embeddings = OpenAIEmbeddings()
    pinecone.init(
        api_key=PINECONE_API_KEY,
        environment=PINECONE_CLOUD
    )
    if PINECONE_INDEX_NAME in pinecone.list_indexes():
        # Reuse the existing index instead of re-embedding the documents.
        vector_search_index = Pinecone.from_existing_index(PINECONE_INDEX_NAME, embeddings)
    else:
        # Create the index, then embed and upsert the chunks into it.
        pinecone.create_index(
            PINECONE_INDEX_NAME,
            dimension=PINECONE_DIMENSION,
            metric=PINECONE_METRICS
        )
        vector_search_index = Pinecone.from_documents(
            chunks,
            embeddings,
            index_name=PINECONE_INDEX_NAME
        )
    return vector_search_index
def retrieve_query(documents, query, k=4):
    index = get_vector_search_index(documents)
    matching_results = index.similarity_search(query=query, k=k)
    return matching_results
def process_pdf(directory='doc/', chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    loader = PyPDFDirectoryLoader(directory)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    documents = text_splitter.split_documents(data)
    return documents
def chunk_data(documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    doc = text_splitter.create_documents([documents])
    return doc
def init_vdb():
    content = get_webpages_content()
    chunked_content = chunk_data(content)
    chunked_document = process_pdf()
    chunked = chunked_content + chunked_document
    index = get_vector_search_index(chunked)
    return index


init_vdb()
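# --- Usage sketch (illustrative assumption, not part of the original file) ---
# Once init_vdb() has built or loaded the Pinecone index, matching chunks can be
# fetched with retrieve_query(). The sample question and k value below are
# placeholders, not values taken from this repository.
#
#   chunks = process_pdf()
#   matches = retrieve_query(chunks, "How does HackerEarth support remote hiring?", k=3)
#   for match in matches:
#       print(match.page_content[:200])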