2024-06-02

Author

Witek ten Hove

OBP:

Agreements:

import PyPDF2
import re
import nltk
import numpy as np
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import ssl
import os
import plotly.express as px
import ollama
from openai import OpenAI
from dotenv import load_dotenv
import spacy

load_dotenv()  # take environment variables from .env.
client = OpenAI()

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Get the current working directory
current_directory = os.getcwd()

# Ensure stopwords are downloaded
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Load pre-trained word2vec model
word2vec_model = api.load('word2vec-google-news-300')

# Function to compute document vector
def get_document_vector(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if len(word_vectors) == 0:
        return np.zeros((300,))
    document_vector = np.mean(word_vectors, axis=0)
    return document_vector

# Function to truncate filename to the first 35 characters
def truncate_filename(filename):
    return filename[:35]

# Specify the folder containing PDF files
pdf_folder = os.path.join(current_directory, 'papers')  # Adjust folder name if necessary

# Get list of all PDF files in the folder
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

# Truncate filenames for labeling
labels = [truncate_filename(f) for f in pdf_files]

# Extract, preprocess text, and compute document vectors
document_vectors = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    try:
        text = extract_text_from_pdf(pdf_path)
        preprocessed_text = preprocess_text(text)
        document_vector = get_document_vector(preprocessed_text, word2vec_model)
        document_vectors.append(document_vector)
    except FileNotFoundError:
        print(f"File not found: {pdf_path}")
    except Exception as e:
        print(f"An error occurred processing {pdf_path}: {e}")

# Convert document_vectors to a 2D array
document_vectors = np.array(document_vectors)

# Ensure the document vectors array is not empty and has the correct shape
if document_vectors.size == 0 or document_vectors.ndim != 2:
    raise ValueError("Document vectors are empty or not in the correct shape.")

# Compute cosine similarity between document vectors
similarity_matrix = cosine_similarity(document_vectors)

# Ensure the diagonal values are exactly 1
np.fill_diagonal(similarity_matrix, 1)

# Display similarity matrix
print("Similarity matrix:")
print(similarity_matrix)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/witoldtenhove/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Similarity matrix:
[[1.        0.9687151]
 [0.9687151 1.       ]]
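Before plotting, a quick sanity check of the embedding approach on toy sentences (illustrative only; it reuses preprocess_text, get_document_vector, word2vec_model, and cosine_similarity defined above). Two related sentences should come out more similar than an unrelated pair:

# Sanity check: related sentences should have a higher cosine similarity
# between their mean-word2vec vectors than unrelated ones.
a = get_document_vector(preprocess_text("Patients wait for their appointment at the clinic."), word2vec_model)
b = get_document_vector(preprocess_text("The clinic schedules patient appointments."), word2vec_model)
c = get_document_vector(preprocess_text("Vikings sailed across the North Sea."), word2vec_model)
print(cosine_similarity([a, b])[0, 1], cosine_similarity([a, c])[0, 1])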
# Create a heatmap for the similarity matrix using plotly express
fig = px.imshow(similarity_matrix,
                labels=dict(x="PDF Files", y="PDF Files", color="Similarity"),
                x=labels,
                y=labels,
                color_continuous_scale='Viridis',
                text_auto=True)
fig.update_layout(title="Document Similarity Heatmap")
fig.show()

Using Ollama for extracting solution methods

See here for running Ollama.

Test

stream = ollama.chat(
    model='llama3',
    messages=[{'role': 'user', 'content': 'Tell me a joke.'}],
    stream=True,
)

for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)
Here's one:

Why couldn't the bicycle stand up by itself?

Because it was two-tired!

Hope that made you smile! Do you want to hear another one?

Real

# Get the current working directory
current_directory = os.getcwd()

# Ensure stopwords are downloaded
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Function to use LLM to extract solution approach
def extract_solution_approach(text):
    prompt = f"You will receive a text. The text is taken from an article on Appointment Scheduling. The authors discusses the problem, the problem modeling and their suggested solution. Extract the parts that discuss the solution approach. If the text does not contain any reference to the solution, return five dots, ....., Here is the text: \n\n{text}."
    stream = ollama.chat(
        model='llama3',
        messages=[
                {"role": "system", "content": "You are a academic researcher specializing in Operations Research."},
                {"role": "user", "content": prompt}
            ],
        stream=True,
    )
    solution_approach = ""
    for chunk in stream:
        solution_approach += chunk['message']['content']
    return solution_approach

# Load pre-trained word2vec model
word2vec_model = api.load('word2vec-google-news-300')

# Function to compute document vector
def get_document_vector(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if len(word_vectors) == 0:
        return np.zeros((300,))
    document_vector = np.mean(word_vectors, axis=0)
    return document_vector

# Function to truncate filename to the first 25 characters
def truncate_filename(filename):
    return filename[:25]

# Specify the folder containing PDF files
pdf_folder = os.path.join(current_directory, 'papers')  # Adjust folder name if necessary

# Get list of all PDF files in the folder
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

# Truncate filenames for labeling
labels = [truncate_filename(f) for f in pdf_files]

# Extract, preprocess text, and compute document vectors for solution approaches
document_vectors = []
solution_approaches = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    try:
        text = extract_text_from_pdf(pdf_path)
        print("Extracting solution approach for", pdf_file)
        solution_approach = extract_solution_approach(text)
        print("Finished extracting solution approach")
        solution_approaches.append(solution_approach)
        preprocessed_text = preprocess_text(solution_approach)
        document_vector = get_document_vector(preprocessed_text, word2vec_model)
        document_vectors.append(document_vector)
    except FileNotFoundError:
        print(f"File not found: {pdf_path}")
    except Exception as e:
        print(f"An error occurred processing {pdf_path}: {e}")

# Show solution approaches
for pdf_file, text in zip(pdf_files, solution_approaches):
    print(pdf_file, "\n", text, "\n")

# Convert document_vectors to a 2D array
document_vectors = np.array(document_vectors)

# Ensure the document vectors array is not empty and has the correct shape
if document_vectors.size == 0 or document_vectors.ndim != 2:
    raise ValueError("Document vectors are empty or not in the correct shape.")

# Compute cosine similarity between document vectors
similarity_matrix = cosine_similarity(document_vectors)

# Ensure the diagonal values are exactly 1
np.fill_diagonal(similarity_matrix, 1)

# Display similarity matrix
print("Similarity matrix:")
print(similarity_matrix)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/witoldtenhove/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Extracting solution approach for Wang et al. - 2020 - Managing Appointment-Based Services in the Presenc-1.pdf
Finished extracting solution approach
Extracting solution approach for Zacharias and Yunes - 2020 - Multimodularity in the Stochastic Appointment Sche.pdf
Finished extracting solution approach
Zacharias and Yunes - 2020 - Multimodularity in the Stochastic Appointment Sche.pdf 
 What a treasure trove of references!

As an Operations Research academic researcher, I'm excited to dive into these papers and explore the various approaches to appointment scheduling in the presence of walk-ins.

From what I can see, there are several key themes and contributions:

1. **Multimodularity**: Many of these papers use multimodularity as a tool for analyzing and optimizing appointment scheduling systems.
2. **Integer programming**: Several authors propose integer programming models to solve appointment scheduling problems with random no-shows and service durations.
3. **Poisson processes**: Some papers employ Poisson processes to model patient arrivals, while others consider more general distributions (e.g., nonhomogeneous Poisson processes).
4. **Service systems**: Many of these studies focus on service systems with multiple identical servers or stochastic service delivery systems.
5. **No-shows and overbooking**: A few authors investigate the impact of no-shows and overbooking on appointment scheduling performance.

These references seem to cover a range of topics, from theoretical foundations (e.g., multimodularity) to practical applications (e.g., hospital scheduling).

Now, I'd love to explore these papers in more detail! Which specific aspects would you like me to focus on? 

Zacharias and Yunes - 2020 - Multimodularity in the Stochastic Appointment Sche.pdf 
 A treasure trove of research papers in Operations Research! As a specialist in the field, I'm excited to dive into these references.

From what I can see, this collection of papers focuses on appointment scheduling and service delivery systems, with an emphasis on stochastic models and optimization techniques. Some key themes that catch my attention include:

1. Scheduling arrivals to stochastic service delivery systems (Teo et al., 2013)
2. Optimizing appointment schedules for medical and kindred facilities (Lau et al., 2000)
3. Dynamic scheduling of outpatient appointments under patient no-shows and cancellations (Liu et al., 2010)
4. Submodular function optimization in service delivery systems (Krause, 2010; Orlin, 2009; Svitkina & Fleischer, 2011)
5. Copositive cones and multimodularity in appointment scheduling (Zacharias & Yunes, 2020)

These papers seem to tackle a range of challenges related to appointment scheduling, including managing patient no-shows, cancellations, and overbooking, as well as optimizing service delivery systems under uncertainty.

I'll make sure to take some time to read through these references and learn from the research in this area. 

Similarity matrix:
[[1.        0.9154495]
 [0.9154495 1.       ]]
# Create a heatmap for the similarity matrix using plotly express
fig = px.imshow(similarity_matrix,
                labels=dict(x="PDF Files", y="PDF Files", color="Similarity"),
                x=labels,
                y=labels,
                color_continuous_scale='Viridis',
                text_auto=True)
fig.update_layout(title="Document Similarity Heatmap")
fig.show()

Testing with OpenAI

response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
    {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
    {"role": "user", "content": "Where was it played?"}
  ]
)

response.choices[0].message.content
'The 2020 World Series was played at Globe Life Field in Arlington, Texas.'
import openai
from text_chunker import TextChunker

# Function to extract text about a particular topic from a chunk
def extract_topic_from_chunk(chunk, topic):
    prompt = f"Extract all text about {topic} from the following text:\n\n{chunk}"
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1000
    )
    return response.choices[0].message.content

# Function to process the entire text
def extract_topic_from_text(text, topic):
    chunker = TextChunker(maxlen=40)  # Note: maxlen=40 gives very short chunks; see the output below
    extracted_texts = []

    for chunk in chunker.chunk(text):
        extracted_text = extract_topic_from_chunk(chunk, topic)
        extracted_texts.append(extracted_text)

    return "\n".join(extracted_texts)

# Example usage
large_text = """The Vikings originated from the Scandinavian region, encompassing modern-day Norway, Sweden, and Denmark, during the late 8th to early 11th centuries. They were known for their seafaring skills, which enabled them to explore, trade, and raid across wide swathes of Europe, Asia, and even North America. The term "Viking" itself is derived from the Old Norse word "vikingr," which means "pirate" or "sea warrior." Their culture was rich with maritime tradition, and they left a lasting impact on the areas they encountered through their extensive voyages and settlements."""
topic = "vikings"

extracted_text = extract_topic_from_text(large_text, topic)
print(extracted_text)
I'm sorry, but you didn't provide the full text. Could you please provide the entire text so I can extract the information about Vikings for you?
I'm sorry, but it seems like the text you provided is cut off. Could you please provide the full text so I can extract the information about Vikings for you?
The text does not contain any specific information about vikings. Would you like me to provide more information or help with anything else?
I'm sorry, but the text you provided seems to be incomplete. Please provide me with the full text so I can extract information about vikings for you.
I'm sorry, but it seems I need to see the actual text to extract the information about vikings. Please provide the text, and I'll be able to help you with that.
Sorry, I need more text in order to provide a meaningful extraction. Could you please provide a larger portion of the text?
I'm sorry, but it seems like the provided text is incomplete. Please provide the full text so I can extract all the information about Vikings for you.
I'm sorry, but it seems like the text you provided does not include any information about Vikings. Could you please provide the text that mentions Vikings so I can assist you further?
I'm sorry, but the text you provided does not mention anything about Vikings. Would you like me to provide information about Vikings from another source?
I'm sorry, but the text provided is incomplete. Please provide the complete text, and I will be happy to help extract all information about vikings from it.
The text regarding Vikings is: "from the Old Norse word 'vikingr,' "
I'm sorry, but there is no mention of Vikings in the text you provided. Would you like me to provide general information about Vikings instead?
I'm sorry, but you didn't provide the full text for me to extract the information about Vikings. Could you please provide the complete text so I can help you with that?
I'm sorry, but it seems the provided text is incomplete. Please provide more context or the full text so I can extract all the information about Vikings accurately.
I'm sorry, but there is no mention of Vikings in the provided text. Would you like to know more about Vikings or anything else related to history?
Sorry, it seems that the text you provided is incomplete. Could you please provide the full text so I can extract the information about vikings for you?
I'm sorry, but you haven't provided any text for me to extract the information from. Could you please share the text with me so I can help you with that?
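The refusals above are consistent with the very small chunk length: with maxlen=40 each chunk carries almost no context. A minimal sketch of the same pipeline with a larger chunk length (the helper name extract_topic_from_text_v2 is just for illustration, and it assumes TextChunker's maxlen is expressed in the same unit as above):

# Sketch: rerun the topic extraction with larger chunks so that each chunk
# carries enough context for the model (assumes TextChunker's maxlen unit is unchanged).
def extract_topic_from_text_v2(text, topic, max_len=2000):
    chunker = TextChunker(maxlen=max_len)
    return "\n".join(extract_topic_from_chunk(chunk, topic) for chunk in chunker.chunk(text))

print(extract_topic_from_text_v2(large_text, topic))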
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

# Function to preprocess text using spaCy
def preprocess_text(text):
    doc = nlp(text)
    words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
    return ' '.join(words)

# Function to split text into chunks
def split_text(text, max_tokens):
    chunker = TextChunker(maxlen=max_tokens)
    return chunker.chunk(text)

# Function to use an OpenAI chat model (gpt-3.5-turbo) to extract the solution approach
def extract_solution_approach(text):
    solution_approach = ""
    for chunk in split_text(text, 4000):
        prompt = f"You will receive a text. The text is taken from an article on Appointment Scheduling. The authors discusses the problem, the problem modeling and their suggested solution. Extract the parts that discuss the solution approach. If the text does not contain any reference to the solution, return five dots, ....., Here is the text: \n\n{chunk}."
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a academic researcher specializing in Operations Research."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=1000
        )
        solution_approach += response.choices[0].message.content
    return solution_approach

# Load pre-trained word2vec model
word2vec_model = api.load('word2vec-google-news-300')

# Function to compute document vector
def get_document_vector(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if len(word_vectors) == 0:
        return np.zeros((300,))
    document_vector = np.mean(word_vectors, axis=0)
    return document_vector

# Function to truncate filename to the first 25 characters
def truncate_filename(filename):
    return filename[:25]

# Specify the folder containing PDF files
pdf_folder = os.path.join(current_directory, 'papers')  # Adjust folder name if necessary

# Get list of all PDF files in the folder
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

# Truncate filenames for labeling
labels = [truncate_filename(f) for f in pdf_files]

# Extract, preprocess text, and compute document vectors for solution approaches
document_vectors = []
solution_approaches = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    try:
        text = extract_text_from_pdf(pdf_path)
        print("Extracting solution approach for", pdf_file)
        solution_approach = extract_solution_approach(text)
        print("Finished extracting solution approach")
        solution_approaches.append(solution_approach)
        preprocessed_text = preprocess_text(solution_approach)
        document_vector = get_document_vector(preprocessed_text, word2vec_model)
        if document_vector.shape == (300,):  # Ensure document vector is the correct shape
            document_vectors.append(document_vector)
    except FileNotFoundError:
        print(f"File not found: {pdf_path}")
    except Exception as e:
        print(f"An error occurred processing {pdf_path}: {e}")

# Show solution approaches
for pdf_file, text in zip(pdf_files, solution_approaches):
    print(pdf_file, "\n", text, "\n")

# Convert document_vectors to a 2D array
document_vectors = np.array(document_vectors)

# Ensure the document vectors array is not empty and has the correct shape
if document_vectors.size == 0 or document_vectors.ndim != 2:
    raise ValueError("Document vectors are empty or not in the correct shape.")

# Compute cosine similarity between document vectors
similarity_matrix = cosine_similarity(document_vectors)

# Ensure the diagonal values are exactly 1
np.fill_diagonal(similarity_matrix, 1)

# Display similarity matrix
print("Similarity matrix:")
print(similarity_matrix)
Extracting solution approach for Wang et al. - 2020 - Managing Appointment-Based Services in the Presenc-1.pdf
Finished extracting solution approach
Extracting solution approach for Zacharias and Yunes - 2020 - Multimodularity in the Stochastic Appointment Sche.pdf
Finished extracting solution approach
Wang et al. - 2020 - Managing Appointment-Based Services in the Presenc-1.pdf 
In this paper, we take a data-analytics approach and develop an optimization model to determine the optimal appointment schedule in the presence of potential walk-ins. Our model is the first known approach that can jointly handle general walk-in processes and heterogeneous, time-dependent no-show behaviors. We demonstrate that, with walk-ins, the optimal schedules are fundamentally different from those without. Our numerical study reveals that walk-ins introduce a new source of uncertainties to the system and cannot be viewed as a simple solution to compensate for patient no-shows. Scheduling, however, is an effective way to counter some of the negative impact from uncertain patient behaviors. Using data from practice, we predict a significant cost reduction (42%–73% on average) if the providers were to switch from current practice (which tends to overlook walk-ins in planning) to our proposed schedules. Although our work is motivated by healthcare, our models and insights can also be applied to general appointment-based services with walk-ins.

In this paper, we take a data-analytics approach and develop decision models to inform the design of daily schedule templates in outpatient care practices where both scheduled and walk-in patients are accepted. Using a large data set obtained from our first collaborating organization, we find that patient walk-in processes and patterns vary across providers, even in one practice. More importantly, walk-ins may not arrive according to the classic (time-inhomogeneous) Poisson process, as often found in the previous literature. In particular, the “zero-event” probability, that is, the chance that no walk-ins arrive in a short time period, may be too large for the Poisson distribution. Motivated by these empirical findings, we develop optimization models that can accommodate general arrival patterns of walk-ins. Specifically, we consider a generic clinic session, with T > 0 appointment slots, for a single provider. Throughout the session, a random number of walk-ins may arrive for services according to some arrival process. We are concerned with determining a right number of appointments to schedule and scheduling them to the T slots simultaneously, in anticipation for potential walk-ins that may arrive over time. The objective is to minimize the expected total cost due to patient waiting, provider idling, and overtime.

Our model is the first known approach that can jointly handle general walk-in processes and heterogeneous, time-dependent patient no-show behaviors. Because of this flexibility, our approach can incorporate almost any finding on these patient behaviors based on empirical data, thus presenting great value for practical use. In particular, we show that the objective function in our optimization model is multimodular in the decision variables when no-show probabilities are homogeneous and time-independent; this elegant property guarantees that a local search yields a global optimum. When no-show probabilities become heterogeneous and time-dependent, we propose an innovative variable transformation to reformulate the original challenging two-stage nonlinear optimization model into a stochastic linear programming model with simple structures, which can be directly solved by off-the-shelf optimization packages. In addition, this reformulation can leverage the multimodularity of the objective function, if this property holds, to further accelerate the solution process. To our knowledge, we are the first to propose such a reformulation, which may have broader applications in other contexts of optimization.

Our work departs from the four studies above in several important ways. First, we explicitly take into account potential walk-ins during the day, and we allow the walk-in process to be general. We demonstrate that the resulting optimization problem is much more complicated, and those elegant properties that hold without walk-ins (e.g., the No Hole property) do not hold anymore when walk-ins are accepted. Second, the previous literature solves for the optimal schedule, either using local search or via enumeration (after characterizing the structural results). We are, however, able to provide the first two-stage stochastic linear programming formulation for the appointment-scheduling problem with both walk-ins and no-shows present. This formulation not only is amenable to many standard mixed-integer programming solvers, but also has a special structure, which allows us to develop a unified solution approach proven to be highly effective in numerical experiments. Third, our modeling framework and solution approaches are very flexible and can also accommodate heterogeneous, time-dependent patient no-show behaviors and random service times (of a certain distribution). .....

We suggest that appointment-scheduling models should be able to accommodate general arrival patterns of walk-ins. We develop one such optimization model in the following sections.

The text does not contain any reference to the solution. Therefore, here are five dots: .....

The text contains the following parts that discuss the solution approach:

1. "When deriving the optimal solution, we use (6) for simplicity; we will use (5) when the actual objective value is needed, such as calculating the expected total cost associated with a schedule."

2. "To calculate ΓS,ΓW,and ΓD, we first evaluate Πt(k), the probability of k patients waiting for services at the end of t."

3. "Before analyzing patient wait time, we need to specify the priority order between scheduled patients and walk-ins."

4. "Next, we evaluate patient wait time, starting with the scheduled patients who have priority."

If you need further information or clarification, please let me know.

The solution approach discussed in the text is as follows:

"Finally, our optimization problem in this section can be represented as

    min_{x ∈ Z_+^T}  Γ_S(x) + C_W Γ_W(x) + C_D Γ_D(x) − C_I ∑_{t=1}^{T} x_t        (P1)

Γ_S(x), Γ_W(x), Γ_D(x) are defined in (8), (9), (10), respectively."

"Proposition 1. Define f(x): Z_+^T → R, the objective function of (P1), by f(x) := Γ_S(x) + C_W Γ_W(x) + C_D Γ_D(x) − C_I ∑_{t=1}^{T} x_t. Then f(x): Z_+^T → R is multimodular on its domain Z_+^T."

"Proposition 1 and Lemma 1 suggest that a local optimal solution of (P1) is also globally optimal."

"Corollary 1. If x∈ZT + and f(x)≤f(x′) for any feasible neighbor x′ of x, then x is a global optimal solution for (P1)."

Therefore, the text provides insights into the optimization problem and the approach to finding optimal solutions.

Proposition 2. For (P1), there exists an optimal schedule that does not overbook, that is, x_t ≤ 1 for all t = 1, 2, ..., T.

Proposition 2 indicates that in order to find an optimal schedule, we only need to examine at most 2^T possible ones that do not overbook. Recall that Corollary 1 suggests only local search in the neighborhood of the current solution is needed. Thus, in order to find an optimal schedule, one only needs to consider those non-overbooking schedules in local search. Leveraging both the structural properties of the objective function and those of the optimal schedule can drastically reduce the search space for the optimal schedule.

To solve (P2) more efficiently, we propose a two-stage programming approach. As demonstrated later, this two-stage optimization model is quite novel: it provides a much quicker way to evaluate the objective function, and in addition, the multimodularity result obtained in Proposition 3 earlier, if it holds, can be used to guide the solution search in this two-stage programming model.

The solution approach discussed in the text is as follows:

"We define a new set of decision variables z_{t,i}, t = 1, 2, ..., T and i = 1, 2, ..., N_S, such that if patient i is scheduled at t, then z_{t,i} = 1, otherwise z_{t,i} = 0. We choose N_S to be a sufficiently large number so that at optimality no more than N_S patients would be scheduled. (Lemma 8 in the online appendix shows how to obtain such an N_S.) Let z = (z_{1,1}, ..., z_{1,N_S}, z_{2,1}, ..., z_{2,N_S}, ..., z_{T,1}, ..., z_{T,N_S})′ ∈ {0,1}^{T·N_S}, where the superscript ′ of a vector or a matrix represents the transpose operator. Noting that x_t = ∑_{i=1}^{N_S} z_{t,i}, ∀t = 1, 2, ..., T, we obtain an equivalent two-stage stochastic integer programming model of (T1), described in the following proposition."

Therefore, the solution approach involves defining new decision variables and formulating a two-stage stochastic integer programming model to address the appointment scheduling problem.

Solution Approaches

Sample Average Approximation. One common approach to solve a two-stage stochastic programming problem is via sample average approximation (SAA) — that is, randomly generating a sufficient number of sample scenarios and then minimizing the average cost of these samples. With a slight abuse of the notations, we let Ω be the set of all samples randomly generated, and ω ∈ Ω represent one sample in the set. Let |Ω| denote the number of samples. Then, we can (approximately) solve (T2) by solving the following integer programming problem:

    min_{y(ω) ≥ 0, z ∈ {0,1}^{T·N_S}}  (1/|Ω|) ∑_{ω ∈ Ω} [c′ y(ω) − C_I γ(ω) z]        (T2-SAA)
    s.t.  W z ≤ e,  U y(ω) − M(ω) z ≥ d(ω)  ∀ω ∈ Ω.

By reformulating the original problem (T1) into a mixed-integer linear program (T2-SAA), we make a challenging problem amenable by many off-the-shelf optimization software packages such as Gurobi. Directly solving (T2-SAA) via optimization software is clearly one solution approach, but this method does not take full advantage of the multimodularity result established in Proposition 3. To leverage this important property of the objective function, we can find a potentially optimal schedule via local search in the first stage and evaluate this solution via solving the second-stage problem. Using the fact that the second-stage problem is a pure linear program (LP), we can further speed up the search procedure in the first stage. Extensive numerical experiments in Section 6 show that our proposed approach, which exploits both the structural properties of the objective function and the linear reformulation, is much faster than all known methods that can solve the present problem. Details of our proposed approach are illustrated in the section below.

Constraint Generation Algorithm. Motivated by Wollmer (1980), we can write the dual of the second-stage problem (Prim) for given z and scenario ω as follows:

    max_{v ≥ 0}  v′ (M(ω) z + d(ω))        (Dual)
    s.t.  U′ v ≤ c.

Recall that the primal problem (Prim) is to calculate the cost under scenario ω and decision z, so it is always feasible and bounded. Thus, the dual problem (Dual) is also feasible and bounded. Let v(z,ω) be the optimal solution of (Dual) given z and ω. Denote the set {z | Wz ≤ e, z ∈ {0,1}^{T·N_S}} as 𝒵. Let a(z) = E_ω[v(z,ω)′ M(ω)], b(z) = E_ω[v(z,ω)′ d(ω)], h(z) = E_ω[C_I γ(ω) z].

Proposition 5. Problem (T2) is equivalent to the following Problem (T2-D):

    min_{z ∈ 𝒵, u}  u        (T2-D)
    s.t.  a(z′) z + b(z′) − h(z) ≤ u  for all z′ ∈ 𝒵.        (19)

Proposition 5 is essential in the development of our algorithm. Its proof can be found in the online appendix, and here we provide an intuitive explanation. By strong duality, we know that for each z and ω, the objective value of (Dual) is an upper bound to that of (Prim), and this bound is tight. It can be shown that this relationship also holds when taking expectation with respect to ω. Specifically, let Υ(z) = E_ω[Υ(z,ω)], where Υ(z,ω) is the objective value for (Prim). Then, for any given z ∈ 𝒵, we have ....

The algorithm below specifies how to solve (T2-D).
Algorithm 1 Constraint Generation Algorithm (CGA)
 1: initialize a schedule z*, u ← Υ(z*) − h(z*), A ← a(z*), b ← b(z*), e ← 1
 2: while indicator = 1 do
 3:     indicator ← 0
 4:     for all neighbors of z* (in the sense of Definition 2) do
 5:         let z0 denote the current neighbor
 6:         if A z0 + b − h(z0) e < u e then
 7:             A ← (A; a(z0)), b ← (b; b(z0)), e ← (e; 1)
 8:             if a(z0) z0 + b(z0) − h(z0) < u then
 9:                 u ← a(z0) z0 + b(z0) − h(z0), z* ← z0, indicator ← 1
10:                 break
11:             end if
12:         end if
13:     end for
14: end while
15: return z*

Theorem 2. Algorithm 1 stops in a finite number of iterations, and its output is an optimal solution of problem (T2).

.....

Local Search starts from a feasible solution, moves to a neighbor solution defined by Definition 2, if any, that improves the current solution, and continues in this fashion until no better solutions can be found in the neighborhood. Given a schedule, its cost is calculated recursively via (12) and (14). By Proposition 3 and Corollary 2, such a procedure always stops at the optimal schedule. However, the number of neighbors of one schedule is 2^(T+1) − 1, which increases exponentially with the size of the problem, and in addition, the recursive calculations may be computationally expensive.

Mixed-Integer Linear Program approach is developed in Section 5.2 by reformulating the original problem (T1), which is very challenging, into an MILP (T2-SAA), which is amenable by many off-the-shelf optimization software packages. In our numerical experiments, we use Gurobi, perhaps one of the fastest solvers, to solve (T2-SAA) directly. However, this approach disregards the multimodularity property of the objective function.

Local Search + Linear Reformulation (LS + LR) follows the same procedure as Local Search, the first approach above, except that this approach solves the second-stage linear program in (T2) to get the cost for a given schedule, instead of using recursive Equations (12) and (14). This approach takes advantage of both the multimodularity result and the linear reformulation. Solving a linear program is likely to be faster than using recursive equations.

Constraint Generation Algorithm is proposed in Section 5.3.2. Similar to the third approach (LS + LR), this approach leverages the multimodularity result and linear reformulation. However, it can eliminate a nonoptimal schedule without knowing its cost, and thus it is expected to further speed up the search procedure.

Compared with “Local Search,” “MILP” is in general faster, but can be slow in some cases. This may be due to the settings of the solver. As expected, “LS + LR” is much faster than the pure “Local Search” in all cases, suggesting that directly solving the second-stage problem as an LP indeed takes much less time than using recursive equations. In “CGA,” the solution search path is identical to those of Local Search and LS + LR, but the objective function can be evaluated much faster by using the dual of the linear reformulation. Thus, CGA ought to be much faster than Local Search and LS + LR. Indeed, CGA performs extremely well and is the best among all four approaches. Specifically, CGA can be 2 – 5 times faster than LS + LR and 2 – 10 times faster than the pure Local Search. Details can be found in Table E10 of the online appendix.Our solution approaches can achieve optimality in these general instances within reasonable amounts of time (see Table E11 in the online appendix).In this section, we design a simple heuristic scheduling policy that can serve as a rule of thumb for practitioners to use. We demonstrate that this simple heuristic performs fairly well, and its optimality gap is on average 10% in our numerical tests.

This heuristic policy has two easy steps. The first step is to determine n, the number of scheduled patients. To make it simple, we choose to ignore the waiting cost of patients and only take into account the idling cost and overtime cost of the provider here. Let κ(n) be a random variable that represents the total number of patients arriving for services. To determine n, we solve the following simple newsvendor-like optimization problem:
    min_{n ∈ Z_+}  C_I E[(T − κ(n))^+] + C_O E[(T − κ(n))^−].

First, we try to match “supply” and “demand” in each slot by calibrating the expected number of patients who arrive for service in each slot to be 1, adjusting for their waiting costs. Note that if there are too many walk-ins in a slot or their waiting cost rate C_W is high, we reserve holes.
If we cannot exhaust allocating all nh patients in the first phase, we consider front-loading.Compared with the observed schedule, the optimal schedule adjusts different time components in the objective function according to the cost parameters. In general, if one particular cost parameter becomes larger, the optimal schedule leads to more reduction of the corresponding time component, possibly at the price of a (slight) increase in other time components. For instance, if the idling cost rate CI increases, the optimal schedule seeks to reduce provider idle time, but may increase other competing time components in the objective function such as provider overtime. Although cost parameters (such as patient waiting cost rate) may not be straightforward to estimate, cost components in the objective function (such as total expected patient wait times) are much more tangible. Thus, information such as that presented in Table 3 can guide the manager to choose a schedule based on her preferred trade-off among these different time components —cost parameters become only a tool to arrive at such schedules.The authors discuss how their model may be applied in situations of insufficient demand and its performances. They propose two simple, online scheduling policies that assign patients "on the fly" as their requests for appointments arrive. The first policy is called horizontal scheduling, which assigns patients from slot 1 throughout to n at a time and repeats if necessary until all patients have been assigned. The second policy is called vertical scheduling, which assigns up to x* patients to slot t in the order of t=1, 2,..., T until all patients have been assigned. They demonstrate that these two online policies have very good performances compared with their offline benchmarks.Our research reveals several important managerial insights. First, with walk-ins, the optimal schedule has a completely different structure from those identified in the previous literature, which often does not consider walk-ins. Intuitively speaking, in anticipation for walk-ins, some appointment slots need to be purposely left empty. However, due to the complex nature of the problem, an optimal schedule is impossible to conceive without resorting to the methods developed in this paper.

Second, walk-ins introduce a new source of uncertainties to the system and cannot be viewed as a simple solution to compensate for patient no-shows. Scheduling, however, is an effective tool to counter some of the negative impact due to uncertain patient behaviors (walk-ins and no-shows). We demonstrate important practical values of our scheduling approaches, by using data from practice to show a significant cost reduction if the providers were to switch from their current schedules (which tend to overlook walk-ins in planning) to the schedules suggested by us. Full benefit of our model can be realized with sufficient demand; when demand is insufficient, our model can still be applied in an online fashion and deliver excellent performances compared with the offline benchmarks.

Last, in addition to optimizing appointment schedules alone, it may be useful to explore means to influence and control uncertain patient behaviors (and thus to mitigate their potential negative impact). For instance, a less variable walk-in process may lead to reduction in overall system cost.

Our solution approach can solve large-scale problem instances (e.g., T = 30) to optimality within a reasonable amount of time; see Table E12 in the online appendix. .....
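Side note: the newsvendor-like step quoted above can be illustrated with a small Monte Carlo sketch. The distributions and cost parameters below are assumptions for illustration only (Binomial show-ups plus Poisson walk-ins), not the model from the paper:

# Illustrative only: pick the number of scheduled patients n by trading off expected
# idle slots against expected overtime, as in the newsvendor-like step quoted above.
# Distributions and cost parameters are assumed, not taken from the paper.
rng = np.random.default_rng(0)
T_slots, C_I, C_O = 12, 1.0, 1.5       # number of slots, idle and overtime cost rates (assumed)
p_show, lam_walkins = 0.8, 3.0          # assumed show-up probability and mean number of walk-ins

def heuristic_cost(n, samples=20_000):
    kappa = rng.binomial(n, p_show, samples) + rng.poisson(lam_walkins, samples)
    idle = np.maximum(T_slots - kappa, 0)       # (T - kappa)^+
    overtime = np.maximum(kappa - T_slots, 0)   # (T - kappa)^-
    return C_I * idle.mean() + C_O * overtime.mean()

n_heuristic = min(range(3 * T_slots + 1), key=heuristic_cost)
print("Heuristic number of scheduled patients:", n_heuristic)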

Zacharias and Yunes - 2020 - Multimodularity in the Stochastic Appointment Sche.pdf 
 We prove that under general circumstances, the appointment scheduling problem exhibits multimodularity, which allows us to develop efficient solution approaches for finding optimal schedules. By leveraging the discrete nature of arrival epochs and utilizing probabilistic characterization of system workload over time, we derive recursive expressions for performance measures and uncover discrete convexity properties of the optimization problem. These findings enable us to propose effective strategies for designing appointment schedules that balance resource utilization and patient waiting times.The text does not contain any reference to the solution. Therefore, here are five dots: .....Our optimization framework bridges recent advances in discrete convex analysis and submodular set-function minimization over ring families, and has the potential to help address problems in other areas within and beyond the field of healthcare operations...........

The text does not contain any reference to the solution approach......The text does not contain any reference to the solution...........

The authors suggest a solution approach that involves adjusting the recursive distribution to be dependent on the current number of patients scheduled to arrive, leading to more efficient computation of performance measures. They introduce new workload calculations that depend only on the current number of patients scheduled to arrive and follow a random sum of the service times distribution. The text also presents simplified expressions for various performance measures based on the adjusted distribution. Additionally, the authors discuss a discrete optimization problem involving costs associated with scheduling strategies and present a nonlinear integer program to minimize the objective function. They further delve into the properties of supermodularity, componentwise convexity, and multimodularity in the optimization problem, highlighting the effectiveness and efficiency of the search for an optimal schedule......The text does not contain any reference to the solution. Therefore, here are five dots: .....In this text, the parts that discuss the solution approach are as follows:

"In this section we demonstrate how to minimize efficiently a multimodular function over nonnegative integer vectors based on the results of Murota (2004) and Schrijver (2000). Zacharias and Yunes: Multimodularity in Stochastic Appointment Scheduling 752 Management Science, 2020, vol. 66, no. 2, pp. 744 –763, © 2019 INFORMS Downloaded from informs.org by [64.64.123.36] on 09 November 2022, at 08:08. For personal use only, all rights reserved. Multimodular and LZ-convex functions are related through a unimodular coordinate transformation. In particular, let g: Zn → R be a multimodular function. According to Lemma 2.1 in Murota (2005), the function f: Zn → R: x ↦ g(Bx) is LZ-convex, where B = (bij)1≤i,j≤n ∈ Rn×n is a bidiagonal matrix with bij = 1 if j = i, -1 if j = i - 1, 0 otherwise."

"For the rest of this section, to avoid repetition, g will denote a multimodular function and f an LZ-convex function."

"Algorithm 1 (Steepest Descent Algorithm for an LZ-Convex Function f on Zn (Murota 2004)) 1: Pick an x ∈ Zn. 2: Find (ε∗, X∗) = argmin {f(x + εeX): ε ∈ {−1,1}, X ⊆ N}. 3: If f(x) ≤ f(x + ε∗eX∗), then stop (x is a minimizer of f)."

These are the parts of the text that discuss the solution approach.The text contains information about the solution approach. Here are the relevant parts:

- Step 2 in Algorithm 1 relies on unconstrained minimization of two submodular set functions.
- The algorithm can be adjusted to minimize a multimodular function over Zn.
- The algorithm of Murota (2004) is adjusted to efficiently minimize a multimodular function on Zn+.
- The text discusses the minimization of a submodular function defined over a finite integer lattice, which can be solved in polynomial time.
- A local minimum is also a global minimum for an LZ-convex function, allowing for sequential minimization over finite set lattices in polynomial time.
- Algorithm 2 (Steepest Descent Algorithm for a Multimodular Function on Zn+) is presented, involving constrained minimization of submodular set functions.
- The text mentions that exact solutions to constrained minimization of submodular set functions are NP-hard.
- Theorem 7 states that problems (P+x) and (P-x) can be solved in polynomial time via unconstrained submodular set-function minimization for all feasible x.

If you need further details or clarification, feel free to ask.The authors discuss their solution approach in the following sections:

- In Section 5.1–Section 5.4, they consider a setting where no-show probabilities are slot-homogeneous and all patients are punctual when they show up. They mention that under these assumptions, the objective function is multimodular, and their Algorithm 2 terminates with an optimal schedule.

- They mention conducting local search for the best neighbor of a vector x in step 3 of Algorithm 2, which is done in polynomial time based on Theorem 7. They solve constrained problems using unconstrained submodular set-function minimization, as per the transformations in Online Appendix B.

- They discuss using the minimum norm point algorithm of Fujishige (2005) for unconstrained submodular set-function minimization, as implemented in sfo_min_norm_point.m of Krause (2010) with an arithmetic tolerance of 10^-5.

- In Section 5.5, they consider a setting where patients are not necessarily punctual and develop heuristic solutions.

Therefore, the solution approach involves developing algorithms, conducting local search, and utilizing submodular set-function minimization techniques to optimize scheduling strategies under different assumptions and scenarios.

(i) Exhaustive local search (els): The first heuristic incorporates nonpunctuality in the objective function evaluations and performs exhaustive local search (locality as defined in Theorem 1). It terminates with a schedule x_els that is optimal within its exponentially large neighborhood.

(ii) Submodular set-function minimization (ssm): The second heuristic assumes that patients are punctual (and therefore the objective function is assumed to be multimodular), and identifies a locally optimal schedule x_ssm in polynomial time via Algorithm 2.

Overbooking: It is often optimal to overbook certain parts of the schedule to address variability stemming from all sources and its impact on the clinic’s productivity. Overbooking can be done in two ways: by double-booking certain slots or by separating consecutive patients’ appointments by less than the average consultation time. The former type of overbooking should mainly be used earlier in the day, whereas the latter can be spread later throughout the day. Moreover, in the presence of no-shows and stochastic consultation times, the total number of overbooked patients should be such that the expected patient throughput is slightly below the daily capacity to see patients.

Trade-off between productivity and waiting times: In the presence of variability, elimination of idle time (thereby improving overall productivity and throughput) and containment of waiting times are conflicting goals. Our weighted objective approach can capture the relative importance between these goals. As waiting time weighs more in our scheduling decisions, we might have to compromise patient throughput and/or idle time and/or overtime. We recommend that practitioners try different configurations on the objective coefficients and pick the one that strikes the right balance between their operational goals.

The authors provide a solution to the problem by analyzing a stylized analytical model. They also mention that a precise computational analysis based on continuous distributions remains an open direction, although they believe it will not provide any additional insights. Additionally, they introduce the front-loaded order ≤_fl as a more meaningful tool for contrasting different scheduling strategies compared to the standard order ≤ on Z_+^n. Their comparisons and monotonicity patterns based on ≤_fl are grounded on computational experiments, not on theoretical findings. They suggest that developing a theoretical foundation for their observations is a promising future direction. .....
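Side note: both extractions lean on local search over integer schedules, where multimodularity guarantees that a local optimum is global. A generic, naive sketch of that idea (subset moves are enumerated instead of using the papers' submodular-minimization step; the toy objective is not either paper's cost function):

# Illustrative only: naive steepest-descent local search over Z_+^n with moves
# x -> x + eps * e_X, eps in {-1, +1}, X a subset of coordinates. The papers replace
# the subset enumeration with (sub)modular minimization; here we enumerate, so keep n small.
from itertools import combinations

def local_search(f, x0, max_iter=1000):
    x = np.array(x0, dtype=int)
    n = len(x)
    for _ in range(max_iter):
        best_val, best_x = f(x), x
        for eps in (-1, 1):
            for k in range(1, n + 1):
                for subset in combinations(range(n), k):
                    cand = x.copy()
                    cand[list(subset)] += eps
                    if np.all(cand >= 0) and f(cand) < best_val:  # stay in Z_+^n
                        best_val, best_x = f(cand), cand
        if np.array_equal(best_x, x):  # no improving neighbor; for multimodular f this is globally optimal
            return x
        x = best_x
    return x

# Toy usage: a separable convex cost, minimized at (1, 1, 1).
print(local_search(lambda v: sum((vi - 1) ** 2 for vi in v), [3, 0, 2]))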

Similarity matrix:
[[1.        0.9696497]
 [0.9696497 1.       ]]
# Create a heatmap for the similarity matrix using plotly express
fig = px.imshow(similarity_matrix,
                labels=dict(x="PDF Files", y="PDF Files", color="Similarity"),
                x=labels,
                y=labels,
                color_continuous_scale='Viridis',
                text_auto=True)
fig.update_layout(title="Document Similarity Heatmap")
fig.show()
