import requests
from bs4 import BeautifulSoup
import openai
import math
# Set your OpenAI API key
openai.api_key = "YOUR_API_KEY"
# Function to extract text from a URL
# The extract_text_from_url(url) function is a Python function that takes a single argument, url, which represents
# the URL of a web page. The purpose of this function is to fetch the HTML content of the provided URL, parse it,
# and extract all the text within the paragraph (<p>) tags. The function performs the
# following steps:
#
# 1. Use the requests.get(url) method to fetch the HTML content of the provided URL. The fetched content is
# stored in the response variable.
# 2. Create a BeautifulSoup object called soup by passing the fetched HTML content (response.text) and the
# parser type ('html.parser') as arguments to the BeautifulSoup constructor.
# 3. Find all the paragraph (<p>) tags in the parsed HTML using the soup.find_all('p') method. The result is a list of
# paragraph elements stored in the paragraphs variable.
# 4. Iterate through the paragraphs list and extract the text content of each paragraph element using the
# p.get_text() method. Join the extracted texts with a space separator using ' '.join([p.get_text() for p in
# paragraphs]), resulting in a single string containing all the extracted text.
# 5. Return the extracted text as the output of the function.
#
def extract_text_from_url(url):
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
paragraphs = soup.find_all('p')
text = ' '.join([p.get_text() for p in paragraphs])
return text
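# A slightly more defensive variant of the above (a sketch; the name
# extract_text_from_url_safe is hypothetical and is not used by main() below):
def extract_text_from_url_safe(url):
    # Fail fast on HTTP errors and hung connections
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))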
# Function to split text into chunks of at most max_tokens words (a rough proxy for tokens; see NOTE below)
# The split_text(text, max_tokens) function is a Python function that takes two arguments: text, which is a string
# containing the input text, and max_tokens, which is an integer representing the maximum number of tokens
# allowed per chunk. The purpose of this function is to split the input text into smaller chunks, ensuring that each
# chunk has no more tokens than the specified max_tokens.
#
# The function performs the following steps:
# 1. Split the input text into words using the text.split() method. The result is a list of words stored in the words
# variable.
# 2. Calculate the number of chunks needed to divide the text by dividing the total number of words (len(words))
# by the max_tokens and then rounding up using math.ceil(). The result is stored in the num_chunks variable.
# 3. Initialize an empty list named chunks to store the generated text chunks.
# 4. Iterate through the range of num_chunks using a for loop with the index variable i. For each iteration:
# a. Calculate the start and end indices of the words to be included in the current chunk using
# start = i * max_tokens and end = (i + 1) * max_tokens, respectively.
# b. Create a chunk by joining the words in the specified range using ' '.join(words[start:end]). This results in a
# string containing the words from the start index to the end index (excluding the end index).
# c. Append the created chunk to the chunks list.
# 5. After the loop, return the chunks list containing the split text as the output of the function.
#
# NOTE: This is not implemented exactly as I intended. At the time of writing this note (Apr 28, 2023), the GPT-3.5
# API was open to the public and had a maximum token limit of 4096. My intentions in writing this function were:
# i) split the whole text extracted from the URL into chunks of up to 4096 tokens
# ii) let the model memorize each of the chunks separately
# iii) then let the model write a summary of the whole / consolidated chunks
# I did these steps manually with ChatGPT and it seemed to work as expected,
# but it didn't work as I wanted via the API. I ran into two major issues:
# a) The GPT model handles the prompt in units of tokens, and tokens are a little different from words.
# It is difficult to figure out the number of tokens in the contents of a URL. I tried to use a tokenizer
# library but it didn't work well; the tokenizer usage seems to change often. (See the token-based
# sketch after split_text below.)
# b) The API-based model has no capability of just memorizing the prompt.
# I think there might be ways to handle this, since there are services that summarize long content (e.g.
# ChatPDF), but I didn't find the right solution myself and didn't spend much time on it.
#
def split_text(text, max_tokens):
words = text.split()
num_chunks = math.ceil(len(words) / max_tokens)
chunks = []
for i in range(num_chunks):
start = i * max_tokens
end = (i + 1) * max_tokens
chunk = ' '.join(words[start:end])
chunks.append(chunk)
return chunks
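# A quick worked example of split_text's behavior:
#   split_text("one two three four five", 2) -> ["one two", "three four", "five"]
#
# As the NOTE above explains, words are only a rough proxy for tokens. Below is a
# minimal sketch of splitting by real tokens with the tiktoken library (this
# assumes tiktoken is installed; split_text_by_tokens is a hypothetical name and
# is not used by main() below):
def split_text_by_tokens(text, max_tokens):
    import tiktoken  # pip install tiktoken
    # encoding_for_model maps "text-davinci-003" to its tokenizer (p50k_base)
    enc = tiktoken.encoding_for_model("text-davinci-003")
    tokens = enc.encode(text)
    chunks = []
    for start in range(0, len(tokens), max_tokens):
        # Decode each slice of token ids back into a text chunk
        chunks.append(enc.decode(tokens[start:start + max_tokens]))
    return chunks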
# Function to summarize the text using OpenAI API
# The summarize_text(chunks) function is a Python function that takes one argument, chunks, which is a list of text
# chunks. The purpose of this function
# is to summarize each chunk of text using the OpenAI API and return a single summarized string. The function
# performs the following steps:
#
# 1. Initialize an empty list named summaries to store the summaries of each chunk.
# 2. Iterate through the chunks list using a for loop with the variable chunk. For each iteration:
# a. Call the openai.Completion.create() method with the following parameters:
# engine: Set to "text-davinci-003" to use the specified OpenAI engine for generating the summary.
# prompt: Construct the prompt by concatenating "Please summarize the following text: " with the current
# chunk.
# temperature: Set to 0.5 to control the randomness of the output text.
# max_tokens: Set to 500 to limit the length of the generated summary.
# top_p: Set to 1 to sample from the entire probability distribution.
# frequency_penalty: Set to 0 to not penalize frequent words.
# presence_penalty: Set to 0 to not penalize new tokens based on the presence of similar tokens.
# b. Store the API response in the response variable.
# c. Extract the generated summary text from the response by accessing response.choices[0].text and using
# the strip() method to remove any leading or trailing whitespace. Store the summary in the summary variable.
# d. Append the summary to the summaries list.
# 3. After the loop, join the summaries in the summaries list with a space separator using ' '.join(summaries)
# to create a single summarized string.
# 4. Return the single summarized string as the output of the function.
#
def summarize_text(chunks):
summaries = []
for chunk in chunks:
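        # NOTE: openai.Completion.create and the "engine" parameter are the legacy
        # (pre-1.0) openai Python SDK interface; newer SDK versions differ.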
response = openai.Completion.create(
engine="text-davinci-003",
prompt=f"Please summarize the following text: {chunk}",
temperature=0.5,
max_tokens=500,
top_p=1,
frequency_penalty=0,
presence_penalty=0
)
summary = response.choices[0].text.strip()
summaries.append(summary)
return ' '.join(summaries)
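# The NOTE above describes a final step that main() below does not perform:
# condensing the per-chunk summaries into one consolidated summary. A minimal
# sketch of that second pass, reusing the same legacy API call (the name
# consolidate_summaries is hypothetical and is not wired into main()):
def consolidate_summaries(combined_summary):
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=f"Please write a single coherent summary of these partial summaries: {combined_summary}",
        temperature=0.5,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].text.strip()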
def main():
# Get the URL from the user
url = input("Enter the URL to fetch text and summarize: ")
# Extract text from the URL
full_text = extract_text_from_url(url)
# Split the text into chunks if needed
    max_tokens = 4096 - 150  # Rough headroom for the prompt wrapper; words only approximate tokens (see NOTE above)
chunks = split_text(full_text, max_tokens)
# Summarize the text using OpenAI API
summary = summarize_text(chunks)
print("\nSummary:")
print(summary)
if __name__ == "__main__":
main()