diff --git a/README.md b/README.md index 1a5f0a4534..88980d5f72 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ pip install -e ".[dev, tutorials]" Here's a quick run down of the main components of a Bayesian optimization loop. For more details see our [Documentation](https://botorch.org/docs/introduction) and the -[Tutorials](https://botorch.org/tutorials). +[Tutorials](https://botorch.org/docs/tutorials). 1. Fit a Gaussian Process model to data ```python diff --git a/botorch/models/cost.py b/botorch/models/cost.py index e7a710c8fa..604e749fe5 100644 --- a/botorch/models/cost.py +++ b/botorch/models/cost.py @@ -9,7 +9,7 @@ Cost are useful for defining known cost functions when the cost of an evaluation is heterogeneous in fidelity. For a full worked example, see the -`tutorial `_ on continuous +`tutorial `_ on continuous multi-fidelity Bayesian Optimization. """ @@ -29,7 +29,7 @@ class AffineFidelityCostModel(DeterministicModel): cost = fixed_cost + sum_j weights[j] * X[fidelity_dims[j]] For a full worked example, see the - `tutorial `_ on continuous + `tutorial `_ on continuous multi-fidelity Bayesian Optimization. Example: diff --git a/botorch/models/gp_regression_fidelity.py b/botorch/models/gp_regression_fidelity.py index add63558c7..cc583a7953 100644 --- a/botorch/models/gp_regression_fidelity.py +++ b/botorch/models/gp_regression_fidelity.py @@ -8,7 +8,7 @@ Multi-Fidelity Gaussian Process Regression models based on GPyTorch models. For more on Multi-Fidelity BO, see the -`tutorial `__. +`tutorial `__. A common use case of multi-fidelity regression modeling is optimizing a "high-fidelity" function that is expensive to simulate when you have access to diff --git a/botorch/models/transforms/input.py b/botorch/models/transforms/input.py index 74873f3736..48f31d9eaf 100644 --- a/botorch/models/transforms/input.py +++ b/botorch/models/transforms/input.py @@ -1242,7 +1242,7 @@ class AppendFeatures(InputTransform): `RiskMeasureMCObjective` to optimize risk measures as described in [Cakmak2020risk]_. A tutorial notebook implementing the rhoKG acqusition function introduced in [Cakmak2020risk]_ can be found at - https://botorch.org/tutorials/risk_averse_bo_with_environmental_variables. + https://botorch.org/docs/tutorials/risk_averse_bo_with_environmental_variables. The steps for using this to obtain samples of a risk measure are as follows: @@ -1505,7 +1505,7 @@ class InputPerturbation(InputTransform): on optimizing risk measures. A tutorial notebook using this with `qNoisyExpectedImprovement` can be found at - https://botorch.org/tutorials/risk_averse_bo_with_input_perturbations. + https://botorch.org/docs/tutorials/risk_averse_bo_with_input_perturbations. """ is_one_to_many: bool = True diff --git a/docs/acquisition.md b/docs/acquisition.md index aab06a4d4f..f88aa2bbe4 100644 --- a/docs/acquisition.md +++ b/docs/acquisition.md @@ -9,7 +9,7 @@ black box function. BoTorch supports both analytic as well as (quasi-) Monte-Carlo based acquisition functions. It provides a generic -[`AcquisitionFunction`](../api/acquisition.html#acquisitionfunction) API that +[`AcquisitionFunction`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.acquisition.AcquisitionFunction) API that abstracts away from the particular type, so that optimization can be performed on the same objects. @@ -64,7 +64,7 @@ where $\mu(X)$ is the posterior mean of $f$ at $X$, and $L(X)L(X)^T = \Sigma(X)$ is a root decomposition of the posterior covariance matrix. 
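A minimal sketch of evaluating an MC acquisition function on a q-batch, assuming current BoTorch APIs (`SingleTaskGP`, `SobolQMCNormalSampler`, `qExpectedImprovement`); the toy data, sample count, and shapes below are illustrative only:

```python
import torch
from botorch.acquisition.monte_carlo import qExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from botorch.sampling.normal import SobolQMCNormalSampler
from gpytorch.mlls import ExactMarginalLogLikelihood

# Toy training data: 10 points in d=2 with a single outcome.
train_X = torch.rand(10, 2, dtype=torch.double)
train_Y = (train_X * 6.28).sin().sum(dim=-1, keepdim=True)

model = SingleTaskGP(train_X, train_Y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(model.likelihood, model))

# The QMC sampler draws base samples; the reparameterization
# mu(X) + L(X) eps keeps the MC estimate differentiable in X.
sampler = SobolQMCNormalSampler(sample_shape=torch.Size([256]))
qEI = qExpectedImprovement(model=model, best_f=train_Y.max(), sampler=sampler)

# A t-batch of 3 candidate sets, each a q-batch of q=2 points in d=2.
X = torch.rand(3, 2, 2, dtype=torch.double)
print(qEI(X).shape)  # torch.Size([3]): one acquisition value per q-batch
```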
All MC-based acquisition functions in BoTorch are derived from -[`MCAcquisitionFunction`](../api/acquisition.html#mcacquisitionfunction). +[`MCAcquisitionFunction`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.monte_carlo.MCAcquisitionFunction). Acquisition functions expect input tensors $X$ of shape $\textit{batch\_shape} \times q \times d$, where $d$ is the dimension of the @@ -122,15 +122,15 @@ above. BoTorch also provides implementations of analytic acquisition functions that do not depend on MC sampling. These acquisition functions are subclasses of -[`AnalyticAcquisitionFunction`](../api/acquisition.html#analyticacquisitionfunction) +[`AnalyticAcquisitionFunction`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.analytic.AnalyticAcquisitionFunction) and only exist for the case of a single candidate point ($q = 1$). These include classical acquisition functions such as Expected Improvement (EI), Upper Confidence Bound (UCB), and Probability of Improvement (PI). An example -comparing [`ExpectedImprovement`](../api/acquisition.html#expectedimprovement), +comparing [`ExpectedImprovement`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.analytic.ExpectedImprovement), the analytic version of EI, to it's MC counterpart -[`qExpectedImprovement`](../api/acquisition.html#qexpectedimprovement) +[`qExpectedImprovement`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.monte_carlo.qExpectedImprovement) can be found in -[this tutorial](../tutorials/compare_mc_analytic_acquisition). +[this tutorial](tutorials/compare_mc_analytic_acquisition). Analytic acquisition functions allow for an explicit expression in terms of the summary statistics of the posterior distribution at the evaluated point(s). diff --git a/docs/batching.md b/docs/batching.md index 6c4c92dd27..c169f8984e 100644 --- a/docs/batching.md +++ b/docs/batching.md @@ -19,7 +19,7 @@ referred to as q-Acquisition Functions. For instance, BoTorch ships with support for q-EI, q-UCB, and a few others. As discussed in the -[design philosophy](design_philosophy#batching-batching-batching), +[design philosophy](/docs/design_philosophy#parallelism-through-batched-computations), BoTorch has adopted the convention of referring to batches in the batch-acquisition sense as "q-batches", and to batches in the torch batch-evaluation sense as "t-batches". @@ -35,9 +35,9 @@ with samples from the posterior in a consistent fashion. #### Batch-Mode Decorator -In order to simplify the user-facing API for evaluating acquisition functions, +In order to simplify the user-facing API for evaluating acquisition functions, BoTorch implements the -[`@t_batch_mode_transform`](../api/utils.html#botorch.utils.transforms.t_batch_mode_transform) +[`@t_batch_mode_transform`](https://botorch.readthedocs.io/en/latest/utils.html#botorch.utils.transforms.t_batch_mode_transform) decorator, which allows the use of non-batch mode inputs. If applied to an instance method with a single `Tensor` argument, an input tensor to that method without a t-batch dimension (i.e. tensors of shape $q \times d$) will automatically @@ -66,7 +66,7 @@ distribution: of $b_1 \times \cdots \times b_k$, with $n$ data points of $d$-dimensions each in every batch) yields a posterior with `event_shape` being $b_1 \times \cdots \times b_k \times n \times 1$. In most cases, the t-batch-shape will be single-dimensional (i.e., $k=1$). 
-- Evaluating a multi-output model with $o$ outputs at a $b_1 \times \cdots \times b_k +- Evaluating a multi-output model with $o$ outputs at a $b_1 \times \cdots \times b_k \times n \times d$ tensor yields a posterior with `event_shape` equal to $b_1 \times \cdots \times b_k \times n \times o$. - Recall from the previous section that internally, with the help of the @@ -123,7 +123,7 @@ The shape of the test points must support broadcasting to the $\textit{batch_sha necessary over $\textit{batch_shape}$) #### Batched Multi-Output Models -The [`BatchedMultiOutputGPyTorchModel`](../api/models.html#batchedmultioutputgpytorchmodel) +The [`BatchedMultiOutputGPyTorchModel`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.gpytorch.BatchedMultiOutputGPyTorchModel) class implements a fast multi-output model (assuming conditional independence of the outputs given the input) by batching over the outputs. @@ -157,5 +157,5 @@ back-propagating. #### Batched Cross Validation See the -[Using batch evaluation for fast cross validation](../tutorials/batch_mode_cross_validation) +[Using batch evaluation for fast cross validation](tutorials/batch_mode_cross_validation) tutorial for details on using batching for fast cross validation. diff --git a/docs/botorch_and_ax.md b/docs/botorch_and_ax.md index 775fbdff22..a2a690523e 100644 --- a/docs/botorch_and_ax.md +++ b/docs/botorch_and_ax.md @@ -18,7 +18,7 @@ it easy to drive the car. Ax provides a -[`BotorchModel`](https://ax.dev/api/models.html#ax.models.torch.botorch.BotorchModel) +[`BotorchModel`](https://ax.readthedocs.io/en/latest/models.html#ax.models.torch.botorch.BotorchModel) that is a sensible default for modeling and optimization which can be customized by specifying and passing in bespoke model constructors, acquisition functions, and optimization strategies. @@ -43,7 +43,7 @@ the the Bayesian Optimization loop untouched. It is then straightforward to plug your custom BoTorch model or acquisition function into Ax to take advantage of Ax's various loop control APIs, as well as its powerful automated metadata management, data storage, etc. See the -[Using a custom BoTorch model in Ax](../tutorials/custom_botorch_model_in_ax) +[Using a custom BoTorch model in Ax](tutorials/custom_botorch_model_in_ax) tutorial for more on how to do this. @@ -53,8 +53,8 @@ If you're working in a non-standard setting, such as structured feature or design spaces, or where the model fitting process requires interactive work, then using Ax may not be the best solution for you. In such a situation, you might be better off writing your own full Bayesian Optimization loop in BoTorch. -The [q-Noisy Constrained EI](../tutorials/closed_loop_botorch_only) tutorial and -[variational auto-encoder](../tutorials/vae_mnist) tutorial give examples of how +The [q-Noisy Constrained EI](tutorials/closed_loop_botorch_only) tutorial and +[variational auto-encoder](tutorials/vae_mnist) tutorial give examples of how this can be done. You may also consider working purely in BoTorch if you want to be able to diff --git a/docs/constraints.md b/docs/constraints.md index 216eb1ba5c..3103a111fd 100644 --- a/docs/constraints.md +++ b/docs/constraints.md @@ -41,7 +41,7 @@ the constrained expected improvement variant is mathematically equivalent to the unconstrained expected improvement of the objective, multiplied by the probability of feasibility under the modeled outcome constraint.
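A small sketch of that weighting in code, using the analytic `ConstrainedExpectedImprovement`; the two-outcome model, the feasibility threshold of 1.0, and the toy data are assumptions chosen purely for illustration:

```python
import torch
from botorch.acquisition.analytic import ConstrainedExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(12, 2, dtype=torch.double)
objective = train_X.sum(dim=-1, keepdim=True)           # outcome 0: maximized
constraint = (train_X ** 2).sum(dim=-1, keepdim=True)   # outcome 1: feasible if <= 1.0
train_Y = torch.cat([objective, constraint], dim=-1)

model = SingleTaskGP(train_X, train_Y)  # two-outcome model
fit_gpytorch_mll(ExactMarginalLogLikelihood(model.likelihood, model))

# EI on outcome 0, multiplied by the probability that outcome 1 falls in (-inf, 1.0].
best_feasible = objective[(constraint <= 1.0).squeeze(-1)].max()
cEI = ConstrainedExpectedImprovement(
    model=model,
    best_f=best_feasible,
    objective_index=0,
    constraints={1: (None, 1.0)},
)
print(cEI(torch.rand(5, 1, 2, dtype=torch.double)))  # one value per candidate
```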
-See the [Closed-Loop Optimization](../tutorials/closed_loop_botorch_only) +See the [Closed-Loop Optimization](tutorials/closed_loop_botorch_only) tutorial for an example of using outcome constraints in BoTorch. diff --git a/docs/design_philosophy.md b/docs/design_philosophy.md index 5367760e85..ac65385901 100644 --- a/docs/design_philosophy.md +++ b/docs/design_philosophy.md @@ -69,7 +69,7 @@ all data available. In typical machine learning model training, a stochastic version of the empirical loss, obtained by "mini-batching" the data, is optimized using stochastic optimization algorithms. -In BoTorch, [`AcquisitionFunction`](../api/acquisition.html#acquisitionfunction) +In BoTorch, [`AcquisitionFunction`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.acquisition.AcquisitionFunction) modules map an input design $X$ to the acquisition function value. Optimizing the acquisition function means optimizing the output over the possible values of $X$. If the acquisition function is deterministic, then so is the optimization diff --git a/docs/getting_started.mdx b/docs/getting_started.mdx index 0b2f9b73f8..d5d10b6836 100644 --- a/docs/getting_started.mdx +++ b/docs/getting_started.mdx @@ -89,13 +89,13 @@ Here's a quick run down of the main components of a Bayesian Optimization loop. ## Tutorials Our Jupyter notebook tutorials help you get off the ground with BoTorch. -View and download them [here](../tutorials). +View and download them [here](tutorials). ## API Reference For an in-depth reference of the various BoTorch internals, see our -[API Reference](../api). +[API Reference](https://botorch.readthedocs.io/). ## Contributing diff --git a/docs/models.md b/docs/models.md index cec82f5c46..37e284b48c 100644 --- a/docs/models.md +++ b/docs/models.md @@ -13,15 +13,15 @@ the posterior distribution is a multivariate normal. While BoTorch supports many GP models, **BoTorch makes no assumption on the model being a GP** or the posterior being multivariate normal. With the exception of some of the analytic acquisition functions in the -[`botorch.acquisition.analytic`](../api/acquisition.html#analytic-acquisition-function-api) +[`botorch.acquisition.analytic`](https://botorch.readthedocs.io/en/latest/acquisition.html#analytic-acquisition-function-api) module, BoTorch’s Monte Carlo-based acquisition functions are compatible with any model that conforms to the `Model` interface, whether user-implemented or provided. Under the hood, BoTorch models are PyTorch `Modules` that implement the -light-weight [`Model`](../api/models.html#model-apis) interface. When working +light-weight [`Model`](https://botorch.readthedocs.io/en/latest/models.html#model-apis) interface. When working with GPs, -[`GPyTorchModel`](../api/models.html#module-botorch.models.gp_regression) +[`GPyTorchModel`](https://botorch.readthedocs.io/en/latest/models.html#module-botorch.models.gp_regression) provides a base class for conveniently wrapping GPyTorch models. Users can extend `Model` and `GPyTorchModel` to generate their own models. For @@ -84,36 +84,36 @@ BoTorch provides several GPyTorch models to cover most standard BO use cases: These models use the same training data for all outputs and assume conditional independence of the outputs given the input. 
If different training data is required for each output, use a -[`ModelListGP`](../api/models.html#module-botorch.models.model_list_gp_regression) +[`ModelListGP`](https://botorch.readthedocs.io/en/latest/models.html#module-botorch.models.model_list_gp_regression) instead. -- [`SingleTaskGP`](../api/models.html#botorch.models.gp_regression.SingleTaskGP): +- [`SingleTaskGP`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.gp_regression.SingleTaskGP): a single-task exact GP that supports both inferred and observed noise. When noise observations are not provided, it infers a homoskedastic noise level. -- [`MixedSingleTaskGP`](../api/models.html#botorch.models.gp_regression_mixed.MixedSingleTaskGP): +- [`MixedSingleTaskGP`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.gp_regression_mixed.MixedSingleTaskGP): a single-task exact GP that supports mixed search spaces, which combine discrete and continuous features. -- [`SaasFullyBayesianSingleTaskGP`](../api/models.html#botorch.models.fully_bayesian.SaasFullyBayesianSingleTaskGP): +- [`SaasFullyBayesianSingleTaskGP`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.fully_bayesian.SaasFullyBayesianSingleTaskGP): a fully Bayesian single-task GP with the SAAS prior. This model is suitable for sample-efficient high-dimensional Bayesian optimization. ### Model List of Single-Task GPs -- [`ModelListGP`](../api/models.html#module-botorch.models.model_list_gp_regression): +- [`ModelListGP`](https://botorch.readthedocs.io/en/latest/models.html#module-botorch.models.model_list_gp_regression): A multi-output model in which outcomes are modeled independently, given a list of any type of single-task GP. This model should be used when the same training data is not used for all outputs. ### Multi-Task GPs -- [`MultiTaskGP`](../api/models.html#module-botorch.models.multitask): a +- [`MultiTaskGP`](https://botorch.readthedocs.io/en/latest/models.html#module-botorch.models.multitask): a Hadamard multi-task, multi-output GP using an ICM kernel. Supports both known observation noise levels and inferring a homoskedastic noise level (when noise observations are not provided). -- [`KroneckerMultiTaskGP`](../api/models.html#botorch.models.multitask.KroneckerMultiTaskGP): +- [`KroneckerMultiTaskGP`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.multitask.KroneckerMultiTaskGP): A multi-task, multi-output GP using an ICM kernel, with Kronecker structure. Useful for multi-fidelity optimization. -- [`SaasFullyBayesianMultiTaskGP`](../api/models.html#saasfullybayesianmultitaskgp): +- [`SaasFullyBayesianMultiTaskGP`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.fully_bayesian_multitask.SaasFullyBayesianMultiTaskGP): a fully Bayesian multi-task GP using an ICM kernel. The data kernel uses the SAAS prior to model high-dimensional parameter spaces. @@ -128,33 +128,33 @@ additional context on the default hyperparameters. ## Other useful models -- [`ModelList`](../api/models.html#botorch.models.model.ModelList): a +- [`ModelList`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.model.ModelList): a multi-output model container in which outcomes are modeled independently by individual `Model`s (as in `ModelListGP`, but the component models do not all need to be GPyTorch models). 
-- [`SingleTaskMultiFidelityGP`](../api/models.html#botorch.models.gp_regression_fidelity.SingleTaskMultiFidelityGP): +- [`SingleTaskMultiFidelityGP`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.gp_regression_fidelity.SingleTaskMultiFidelityGP): A GP model for multi-fidelity optimization. For more on Multi-Fidelity BO, see - the [tutorial](../tutorials/discrete_multi_fidelity_bo). -- [`HigherOrderGP`](../api/models.html#botorch.models.higher_order_gp.HigherOrderGP): + the [tutorial](tutorials/discrete_multi_fidelity_bo). +- [`HigherOrderGP`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.higher_order_gp.HigherOrderGP): A GP model with matrix-valued predictions, such as images or grids of images. -- [`PairwiseGP`](../api/models.html#module-botorch.models.pairwise_gp): A +- [`PairwiseGP`](https://botorch.readthedocs.io/en/latest/models.html#module-botorch.models.pairwise_gp): A probit-likelihood GP that learns via pairwise comparison data, useful for preference learning. -- [`ApproximateGPyTorchModel`](../api/models.html#botorch.models.approximate_gp.ApproximateGPyTorchModel): +- [`ApproximateGPyTorchModel`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.approximate_gp.ApproximateGPyTorchModel): for efficient computation when data is large or responses are non-Gaussian. -- [Deterministic models](../api/models.html#module-botorch.models.deterministic), +- [Deterministic models](https://botorch.readthedocs.io/en/latest/models.html#module-botorch.models.deterministic), such as - [`AffineDeterministicModel`](../api/models.html#botorch.models.deterministic.AffineDeterministicModel), - [`AffineFidelityCostModel`](../api/models.html#botorch.models.cost.AffineFidelityCostModel), - [`GenericDeterministicModel`](../api/models.html#botorch.models.deterministic.GenericDeterministicModel), + [`AffineDeterministicModel`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.deterministic.AffineDeterministicModel), + [`AffineFidelityCostModel`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.cost.AffineFidelityCostModel), + [`GenericDeterministicModel`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.deterministic.GenericDeterministicModel), and - [`PosteriorMeanModel`](../api/models.html#botorch.models.deterministic.PosteriorMeanModel) + [`PosteriorMeanModel`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.deterministic.PosteriorMeanModel) express known input-output relationships; they conform to the BoTorch `Model` API, so they can easily be used in conjunction with other BoTorch models. Deterministic models are useful for multi-objective optimization with known objective functions and for encoding cost functions for cost-aware acquisition. -- [`SingleTaskVariationalGP`](../api/models.html#botorch.models.approximate_gp.SingleTaskVariationalGP): +- [`SingleTaskVariationalGP`](https://botorch.readthedocs.io/en/latest/models.html#botorch.models.approximate_gp.SingleTaskVariationalGP): an approximate model for faster computation when you have a lot of data or your responses are non-Gaussian. @@ -169,7 +169,7 @@ configurable model class whose implementation is difficult to understand. Instead, we advocate that users implement their own models to cover more specialized use cases. The light-weight nature of BoTorch's Model API makes this easy to do. 
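A bare-bones custom GP in this spirit might look as follows (the class name, kernel choice, and toy data are illustrative assumptions; the tutorial referenced next walks through a fuller version):

```python
import torch
from botorch.models.gpytorch import GPyTorchModel
from gpytorch.distributions import MultivariateNormal
from gpytorch.kernels import RBFKernel, ScaleKernel
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.means import ConstantMean
from gpytorch.models import ExactGP


class SimpleCustomGP(ExactGP, GPyTorchModel):
    """A minimal exact GP that satisfies the BoTorch `Model` API via `GPyTorchModel`."""

    _num_outputs = 1  # tells BoTorch how many outputs the model produces

    def __init__(self, train_X, train_Y):
        super().__init__(train_X, train_Y.squeeze(-1), GaussianLikelihood())
        self.mean_module = ConstantMean()
        self.covar_module = ScaleKernel(RBFKernel(ard_num_dims=train_X.shape[-1]))

    def forward(self, x):
        return MultivariateNormal(self.mean_module(x), self.covar_module(x))


model = SimpleCustomGP(torch.rand(8, 2), torch.rand(8, 1))
posterior = model.posterior(torch.rand(4, 2))  # the Model API used by acquisition functions
print(posterior.mean.shape)  # torch.Size([4, 1])
```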
See the -[Using a custom BoTorch model in Ax](../tutorials/custom_botorch_model_in_ax) +[Using a custom BoTorch model in Ax](tutorials/custom_botorch_model_in_ax) tutorial for an example. The BoTorch `Model` interface is light-weight and easy to extend. The only diff --git a/docs/multi_objective.md b/docs/multi_objective.md index 758dec4311..f19b985610 100644 --- a/docs/multi_objective.md +++ b/docs/multi_objective.md @@ -5,13 +5,13 @@ title: Multi-Objective Bayesian Optimization BoTorch provides first-class support for Multi-Objective (MO) Bayesian Optimization (BO) including implementations of -[`qLogNoisyExpectedHypervolumeImprovement`](../api/acquisition.html#botorch.acquisition.multi_objective.logei.qLogNoisyExpectedHypervolumeImprovement) +[`qLogNoisyExpectedHypervolumeImprovement`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.multi_objective.logei.qLogNoisyExpectedHypervolumeImprovement) (qLogNEHVI)[^qNEHVI][^LogEI], -[`qLogExpectedHypervolumeImprovement`](../api/acquisition.html#botorch.acquisition.multi_objective.logei.qLogExpectedHypervolumeImprovement) +[`qLogExpectedHypervolumeImprovement`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.multi_objective.logei.qLogExpectedHypervolumeImprovement) (qLogEHVI), -[`qLogNParEGO`](../api/acquisition.html#botorch.acquisition.multi_objective.parego.qLogNParEGO)[^qNEHVI], +[`qLogNParEGO`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.multi_objective.parego.qLogNParEGO)[^qNEHVI], and analytic -[`ExpectedHypervolumeImprovement`](../api/acquisition.html#botorch.acquisition.multi_objective.analytic.ExpectedHypervolumeImprovement) +[`ExpectedHypervolumeImprovement`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.multi_objective.analytic.ExpectedHypervolumeImprovement) (EHVI) with gradients via auto-differentiation acquisition functions[^qEHVI]. The goal in MOBO is learn the _Pareto front_: the set of optimal trade-offs, @@ -36,9 +36,9 @@ acquisition functions support using the sample average approximation for rapid convergence [^BoTorch]. All analytic MO acquisition functions derive from -[`MultiObjectiveAnalyticAcquisitionFunction`](../api/acquisition.html#botorch.acquisition.multi_objective.base.MultiObjectiveAnalyticAcquisitionFunction) +[`MultiObjectiveAnalyticAcquisitionFunction`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.multi_objective.base.MultiObjectiveAnalyticAcquisitionFunction) and all MC-based acquisition functions derive from -[`MultiObjectiveMCAcquisitionFunction`](../api/acquisition.html#botorch.acquisition.multi_objective.base.MultiObjectiveMCAcquisitionFunction). +[`MultiObjectiveMCAcquisitionFunction`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.multi_objective.base.MultiObjectiveMCAcquisitionFunction). These abstract classes easily integrate with BoTorch's standard optimization machinery. @@ -46,22 +46,22 @@ machinery. setting, it uses a new random scalarization for each candidate [^qEHVI]. Candidates are selected in a sequential greedy fashion, each with a different scalarization, via the -[`optimize_acqf_list`](../api/optim.html#botorch.optim.optimize.optimize_acqf_list) +[`optimize_acqf_list`](https://botorch.readthedocs.io/en/latest/optim.html#botorch.optim.optimize.optimize_acqf_list) function. 
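A condensed sketch of this sequential-greedy pattern, assuming two toy objectives, a random Chebyshev scalarization per candidate, and deliberately small optimization budgets:

```python
import torch
from botorch.acquisition.logei import qLogNoisyExpectedImprovement
from botorch.acquisition.objective import GenericMCObjective
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from botorch.optim.optimize import optimize_acqf_list
from botorch.utils.multi_objective.scalarization import get_chebyshev_scalarization
from botorch.utils.sampling import sample_simplex
from gpytorch.mlls import ExactMarginalLogLikelihood

d, M, q = 2, 2, 3
train_X = torch.rand(16, d, dtype=torch.double)
train_Y = torch.stack([train_X.sum(-1), (1 - train_X).sum(-1)], dim=-1)  # two objectives

model = SingleTaskGP(train_X, train_Y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(model.likelihood, model))

# One acquisition function per candidate, each with its own random scalarization.
acqf_list = []
for _ in range(q):
    weights = sample_simplex(M, dtype=torch.double).squeeze(0)
    objective = GenericMCObjective(get_chebyshev_scalarization(weights=weights, Y=train_Y))
    acqf_list.append(
        qLogNoisyExpectedImprovement(model=model, X_baseline=train_X, objective=objective)
    )

bounds = torch.stack([torch.zeros(d), torch.ones(d)]).to(torch.double)
candidates, values = optimize_acqf_list(
    acq_function_list=acqf_list, bounds=bounds, num_restarts=10, raw_samples=128
)
print(candidates.shape)  # q x d, selected one point at a time
```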
For a more in-depth example using these acquisition functions, check out the -[Multi-Objective Bayesian Optimization tutorial notebook](../tutorials/multi_objective_bo). +[Multi-Objective Bayesian Optimization tutorial notebook](tutorials/multi_objective_bo). ## Multi-Objective Utilities BoTorch provides several utility functions for evaluating performance in MOBO including a method for computing the Pareto front -[`is_non_dominated`](../api/utils.html#botorch.utils.multi_objective.pareto.is_non_dominated) +[`is_non_dominated`](https://botorch.readthedocs.io/en/latest/utils.html#botorch.utils.multi_objective.pareto.is_non_dominated) and efficient box decomposition algorithms for efficiently partitioning the the space dominated -[`DominatedPartitioning`](../api/utils.html#botorch.utils.multi_objective.box_decompositions.dominated.DominatedPartitioning) +[`DominatedPartitioning`](https://botorch.readthedocs.io/en/latest/utils.html#botorch.utils.multi_objective.box_decompositions.dominated.DominatedPartitioning) or non-dominated -[`NonDominatedPartitioning`](../api/utils.html#botorch.utils.multi_objective.box_decompositions.non_dominated.NondominatedPartitioning) +[`NonDominatedPartitioning`](https://botorch.readthedocs.io/en/latest/utils.html#botorch.utils.multi_objective.box_decompositions.non_dominated.NondominatedPartitioning) by the Pareto frontier into axis-aligned hyperrectangular boxes. For exact box decompositions, BoTorch uses a two-step approach similar to that in [^Yang2019], where (1) Algorithm 1 from [Lacour17]_ is used to find the local lower bounds @@ -76,9 +76,9 @@ also be used to efficiently compute hypervolumes. Additionally, variations on ParEGO can be trivially implemented using an augmented Chebyshev scalarization as the objective with an EI-type single-objective acquisition function such as -[`qLogNoisyExpectedImprovement`](../api/acquisition.html#botorch.acquisition.logei.qLogNoisyExpectedImprovement). +[`qLogNoisyExpectedImprovement`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.logei.qLogNoisyExpectedImprovement). The -[`get_chebyshev_scalarization`](../api/utils.html#botorch.utils.multi_objective.scalarization.get_chebyshev_scalarization) +[`get_chebyshev_scalarization`](https://botorch.readthedocs.io/en/latest/utils.html#botorch.utils.multi_objective.scalarization.get_chebyshev_scalarization) convenience function generates these scalarizations. [^qNEHVI]: @@ -93,9 +93,9 @@ convenience function generates these scalarizations. Neural Information Processing Systems 36, 2023. [paper](https://arxiv.org/abs/2310.20708) "Log" variances of acquisition functions, such as - [`qLogNoisyExpectedHypervolumeImprovement`](../api/acquisition.html#botorch.acquisition.multi_objective.logei.qLogNoisyExpectedHypervolumeImprovement), + [`qLogNoisyExpectedHypervolumeImprovement`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.multi_objective.logei.qLogNoisyExpectedHypervolumeImprovement), offer improved numerics compared to older counterparts such as - [`qNoisyExpectedHypervolumeImprovement`](../api/acquisition.html#botorch.acquisition.multi_objective.monte_carlo.qNoisyExpectedHypervolumeImprovement). + [`qNoisyExpectedHypervolumeImprovement`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.multi_objective.monte_carlo.qNoisyExpectedHypervolumeImprovement). [^qEHVI]: S. Daulton, M. Balandat, and E. Bakshy. 
Differentiable Expected Hypervolume diff --git a/docs/objectives.md b/docs/objectives.md index 961993eeaf..427c81aed7 100644 --- a/docs/objectives.md +++ b/docs/objectives.md @@ -23,11 +23,11 @@ relative impunity so long gradients can be back-propagated through the transformation. All BoTorch objectives are derived from -[`MCAcquisitionObjective`](../api/acquisition.html#mcacquisitionobjective). +[`MCAcquisitionObjective`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.objective.MCAcquisitionObjective). BoTorch implements several MC-based objectives, including -[`LinearMCObjective`](../api/acquisition.html#linearmcobjective) for linear +[`LinearMCObjective`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.objective.LinearMCObjective) for linear combinations of model outputs, and -[`ConstrainedMCObjective`](../api/acquisition.html#constrainedmcobjective) for +[`ConstrainedMCObjective`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.objective.ConstrainedMCObjective) for constrained objectives (using a sigmoid approximation for the constraints). @@ -35,7 +35,7 @@ constrained objectives (using a sigmoid approximation for the constraints). ### Utilizing GenericMCObjective -The [`GenericMCObjective`](../api/acquisition.html#genericmcobjective) allows +The [`GenericMCObjective`](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.objective.GenericMCObjective) allows simply using a generic callable to implement an ad-hoc objective. The callable is expected to map a `sample_shape x batch_shape x q x o`-dimensional tensor of posterior samples and an (optional) `batch_shape x q x d`-dimensional tensor of diff --git a/docs/optimization.md b/docs/optimization.md index f033af7f86..458bf34cf8 100644 --- a/docs/optimization.md +++ b/docs/optimization.md @@ -6,7 +6,7 @@ title: Optimization ## Model Fitting BoTorch provides the convenience method -[`fit_gpytorch_mll()`](../api/fit.html#botorch.fit.fit_gpytorch_mll) for +[`fit_gpytorch_mll()`](https://botorch.readthedocs.io/en/latest/fit.html#botorch.fit.fit_gpytorch_mll) for fitting GPyTorch models (optimizing model hyperparameters) using L-BFGS-B via `scipy.optimize.minimize()`. We recommend using this method for exact GPs, but other optimizers may be necessary for models with thousands of parameters or @@ -17,7 +17,7 @@ observations. #### Using scipy Optimizers on Tensors The default method used by BoTorch to optimize acquisition functions is -[`gen_candidates_scipy()`](../api/generation.html#botorch.generation.gen.gen_candidates_scipy). +[`gen_candidates_scipy()`](https://botorch.readthedocs.io/en/latest/generation.html#botorch.generation.gen.gen_candidates_scipy). Given a set of starting points (for multiple restarts) and an acquisition function, this optimizer makes use of `scipy.optimize.minimize()` for optimization, via either the L-BFGS-B or SLSQP routines. @@ -32,11 +32,11 @@ used directly, without the need to perform `numpy` conversion. These first-order gradient-based optimizers are particularly useful for the case when the acquisition function is stochastic, where algorithms like L-BFGS or SLSQP that are designed for deterministic functions should not be applied. 
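A hand-rolled sketch of that tensor-based pattern (the sampler choice, step count, and unit-cube bounds are assumptions; `gen_candidates_torch`, described next, packages this up):

```python
import torch
from botorch.acquisition.monte_carlo import qExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from botorch.sampling.stochastic_samplers import StochasticSampler
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(10, 2, dtype=torch.double)
train_Y = train_X.sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(model.likelihood, model))

# A stochastic sampler re-draws base samples on every forward pass, so the
# acquisition value is noisy and a first-order stochastic optimizer is appropriate.
sampler = StochasticSampler(sample_shape=torch.Size([128]))
acqf = qExpectedImprovement(model=model, best_f=train_Y.max(), sampler=sampler)

X = torch.rand(1, 2, 2, dtype=torch.double, requires_grad=True)  # one restart, q=2
optimizer = torch.optim.Adam([X], lr=0.05)
for _ in range(75):
    optimizer.zero_grad()
    loss = -acqf(X).sum()  # maximize the acquisition value
    loss.backward()
    optimizer.step()
    with torch.no_grad():
        X.clamp_(0.0, 1.0)  # project back into the unit-cube bounds
print(X.detach())
```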
The function -[`gen_candidates_torch()`](../api/generation.html#botorch.generation.gen.gen_candidates_torch) +[`gen_candidates_torch()`](https://botorch.readthedocs.io/en/latest/generation.html#botorch.generation.gen.gen_candidates_torch) provides an interface for `torch` optimizers and handles bounding. See the example notebooks -[here](../tutorials/compare_mc_analytic_acquisition) and -[here](../tutorials/optimize_stochastic) for tutorials on how to use different +[here](tutorials/compare_mc_analytic_acquisition) and +[here](tutorials/optimize_stochastic) for tutorials on how to use different optimizers. @@ -45,10 +45,10 @@ optimizers. Acquisition functions are often difficult to optimize as they are generally non-convex and can exhibit numerically vanishing gradients, a problem that is particularly prominent in naive formulations of Expected Improvement (EI). -[LogEI](../api/acquisition.html#botorch.acquisition.analytic.LogExpectedImprovement) +[LogEI](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.analytic.LogExpectedImprovement) and its siblings -([qLogNEI](../api/acquisition.html#botorch.acquisition.logei.qLogNoisyExpectedImprovement) and -[qLogNEHVI](../api/acquisition.html#botorch.acquisition.multi_objective.logei.qLogNoisyExpectedHypervolumeImprovement), +([qLogNEI](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.logei.qLogNoisyExpectedImprovement) and +[qLogNEHVI](https://botorch.readthedocs.io/en/latest/acquisition.html#botorch.acquisition.multi_objective.logei.qLogNoisyExpectedHypervolumeImprovement), ...) ameliorate the flatness issue and generally lead to signficiantly higher optimization performance [^Ament2023]. Since convexity cannot be guaranteed in general, BoTorch makes use of @@ -56,7 +56,7 @@ multiple random restarts to improve optimization quality. Each restart gives ris a separate optimization within a particular local region; thus, the best result over many restarts can provide an approximation to the global optimization objective. The function -[`gen_batch_initial_conditions()`](../api/optim.html#botorch.optim.optimize.gen_batch_initial_conditions), which is used by default, +[`gen_batch_initial_conditions()`](https://botorch.readthedocs.io/en/latest/optim.html#botorch.optim.initializers.gen_batch_initial_conditions), which is used by default, implements heuristics for choosing a set of initial restart locations (candidates). Rather than optimize sequentially from each initial restart @@ -65,9 +65,9 @@ evaluation (t-batches) of the acquisition function to solve a single $b \times q \times d$-dimensional optimization problem, where the objective is defined as the sum of the $b$ individual q-batch acquisition values. The wrapper function -[`optimize_acqf()`](../api/optim.html#botorch.optim.optimize.optimize_acqf) +[`optimize_acqf()`](https://botorch.readthedocs.io/en/latest/optim.html#botorch.optim.optimize.optimize_acqf) uses -[`get_best_candidates()`](../api/generation.html#botorch.generation.gen.get_best_candidates) +[`get_best_candidates()`](https://botorch.readthedocs.io/en/latest/generation.html#botorch.generation.gen.get_best_candidates) to process the output of `gen_candidates_scipy()` and return the best point found over the random restarts. 
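Putting these pieces together, a minimal multi-start sketch (the `q`, `num_restarts`, and `raw_samples` values below are illustrative, not recommendations):

```python
import torch
from botorch.acquisition.logei import qLogExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(20, 3, dtype=torch.double)
train_Y = -((train_X - 0.5) ** 2).sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(model.likelihood, model))

acqf = qLogExpectedImprovement(model=model, best_f=train_Y.max())
bounds = torch.stack([torch.zeros(3), torch.ones(3)]).to(torch.double)

# num_restarts = b restarts optimized jointly as one b x q x d problem;
# raw_samples controls the heuristic initialization (gen_batch_initial_conditions).
candidates, acq_value = optimize_acqf(
    acq_function=acqf,
    bounds=bounds,
    q=2,
    num_restarts=10,
    raw_samples=256,
)
print(candidates.shape)  # q x d: the best q-batch over all restarts
```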
For reasonable values of $b$ and $q$, jointly optimizing over random restarts can significantly reduce wall time by exploiting @@ -83,7 +83,7 @@ an optimization problem with a $q \times d$-dimensional decision), but when $q$ is large, one might also consider *sequentially* selecting the $q$ points using successive conditioning on so-called "fantasies", and solving $q$ optimization problems, each with a $d$-dimensional decision. The functions -[`optimize_acqf()`](../api/optim.html#botorch.optim.optimize.optimize_acqf) +[`optimize_acqf()`](https://botorch.readthedocs.io/en/latest/optim.html#botorch.optim.optimize.optimize_acqf) by default performs joint optimization; when specifying `sequential=True` it will perform sequential optimization. diff --git a/docs/posteriors.md b/docs/posteriors.md index b3bf881f54..d13cf256cf 100644 --- a/docs/posteriors.md +++ b/docs/posteriors.md @@ -16,11 +16,11 @@ While the analytic acquisition functions assume that the posterior is a multivariate Gaussian, the Monte-Carlo (MC) based acquisition functions do not make any assumptions about the underlying distribution. Rather, the MC-based acquisition functions only require that the posterior can generate samples through an `rsample` -method. As long as the posterior implements the [`Posterior`](../api/posteriors.html#posterior) +method. As long as the posterior implements the [`Posterior`](https://botorch.readthedocs.io/en/latest/posteriors.html#botorch.posteriors.posterior.Posterior) interface, it can be used with an MC-based acquisition function. In addition, note that gradient-based acquisition function optimization requires the ability to back-propagate gradients through the MC samples. For GP models based on GPyTorch for which the posterior distribution is a multivariate Gaussian, -[`GPyTorchPosterior`](../api/posteriors.html#gpytorchposterior) should be used. +[`GPyTorchPosterior`](https://botorch.readthedocs.io/en/latest/posteriors.html#botorch.posteriors.gpytorch.GPyTorchPosterior) should be used. diff --git a/docs/tutorials/index.mdx b/docs/tutorials/index.mdx index 1990639a2b..3fdc14d0f6 100644 --- a/docs/tutorials/index.mdx +++ b/docs/tutorials/index.mdx @@ -24,7 +24,7 @@ experimentation, in order to simplify the management of your BO loop. Doing so can help you focus on the main aspects of BO (models, acquisition functions, optimization of acquisition functions), rather than tedious loop control. See our -[Documentation]("https://botorch.org/docs/botorch_and_ax) +[Documentation](/docs/botorch_and_ax) for additional information.
Full Optimization Loops
@@ -38,15 +38,15 @@ Rather than guiding you through full end-to-end BO loops, the tutorials in this section focus on specific tasks that you will encounter in customizing your BO algorithms. For instance, you may want to -[write a custom acquisition function](https://botorch.org/tutorials/custom_acquisition) +[write a custom acquisition function](/docs/tutorials/custom_acquisition) and then -[use a custom zero-th order optimizer](https://botorch.org/tutorials/optimize_with_cmaes) +[use a custom zero-th order optimizer](/docs/tutorials/optimize_with_cmaes) to optimize it.
Advanced Usage
Tutorials in this section showcase more advanced ways of using BoTorch. For instance, -[this tutorial](https://botorch.org/tutorials/vae_mnist) +[this tutorial](/docs/tutorials/vae_mnist) shows how to perform BO if your objective function is an image, by optimizing in the latent space of a variational auto-encoder (VAE). diff --git a/tutorials/GIBBON_for_efficient_batch_entropy_search/GIBBON_for_efficient_batch_entropy_search.ipynb b/tutorials/GIBBON_for_efficient_batch_entropy_search/GIBBON_for_efficient_batch_entropy_search.ipynb index f360d22f1b..e28a0ebb66 100644 --- a/tutorials/GIBBON_for_efficient_batch_entropy_search/GIBBON_for_efficient_batch_entropy_search.ipynb +++ b/tutorials/GIBBON_for_efficient_batch_entropy_search/GIBBON_for_efficient_batch_entropy_search.ipynb @@ -8,7 +8,7 @@ "source": [ "## The GIBBON (General-purpose Information-Based Bayesian OptimisatioN) acquisition function\n", "\n", - "A particularly intuitive and empirically effective class of acquisition functions has arisen based on information theory. Information-theoretic Bayesian Optimisation (BO) seeks to reduce uncertainty in the location of high-performing areas of the search space, as measured in terms of differential entropy. BoTorch already supports information-theoretic BO through an implementation of the Max-value Entropy Search (MES) acquisition function [1] (see the [Max-Value Entropy tutorial](./max_value_entropy) for details), which makes evaluations that reduce uncertainty in the maximum value attained by the objective function. However, in order to support batch and multi-fidelity BO, our implementation of MES employs numerical integrations and fantasy observations (i. e., we generate one point each time and when we try to generate the 𝑖-th point of a batch, we condition the models on the π‘–βˆ’1 points generated prior to this). Unfortunately, Each of these calculations can can add significantly to the computational overhead incurred by BO.\n", + "A particularly intuitive and empirically effective class of acquisition functions has arisen based on information theory. Information-theoretic Bayesian Optimisation (BO) seeks to reduce uncertainty in the location of high-performing areas of the search space, as measured in terms of differential entropy. BoTorch already supports information-theoretic BO through an implementation of the Max-value Entropy Search (MES) acquisition function [1] (see the [Max-Value Entropy tutorial](/docs/tutorials/max_value_entropy) for details), which makes evaluations that reduce uncertainty in the maximum value attained by the objective function. However, in order to support batch and multi-fidelity BO, our implementation of MES employs numerical integrations and fantasy observations (i. e., we generate one point each time and when we try to generate the 𝑖-th point of a batch, we condition the models on the π‘–βˆ’1 points generated prior to this). Unfortunately, Each of these calculations can can add significantly to the computational overhead incurred by BO.\n", "\n", "In this notebook, we provide an information-theoretic acquisition function for tasks where objective function query costs are not large enough to overshadow significant optimisation overheads known as General-purpose Information-Based Bayesian OptimisatioN (GIBBON) [2]. 
In this tutorial, we present a very high-level overview of GIBBON and demonstrate its use within BoTorch.\n", "\n", diff --git a/tutorials/baxus/baxus.ipynb b/tutorials/baxus/baxus.ipynb index bb8e830c3b..6b2cb877cc 100644 --- a/tutorials/baxus/baxus.ipynb +++ b/tutorials/baxus/baxus.ipynb @@ -7,7 +7,7 @@ "## BO with BAxUS and TS/EI\n", "\n", "In this tutorial, we show how to implement **B**ayesian optimization with **a**daptively e**x**panding s**u**bspace**s** (BAxUS) [1] in a closed loop in BoTorch.\n", - "The tutorial is purposefully similar to the [TuRBO tutorial](https://botorch.org/tutorials/turbo_1) to highlight the differences in the implementations.\n", + "The tutorial is purposefully similar to the [TuRBO tutorial](https://botorch.org/docs/tutorials/turbo_1) to highlight the differences in the implementations.\n", "\n", "This implementation supports either Expected Improvement (EI) or Thompson sampling (TS). We optimize the Branin2 function [2] with 498 dummy dimensions and show that BAxUS outperforms EI as well as Sobol.\n", "\n", diff --git a/tutorials/bo_with_warped_gp/bo_with_warped_gp.ipynb b/tutorials/bo_with_warped_gp/bo_with_warped_gp.ipynb index 349ad4f167..f273280423 100644 --- a/tutorials/bo_with_warped_gp/bo_with_warped_gp.ipynb +++ b/tutorials/bo_with_warped_gp/bo_with_warped_gp.ipynb @@ -8,7 +8,7 @@ "\n", "In this tutorial, we illustrate how to use learned input warping functions for robust Bayesian Optimization when the outcome may be non-stationary functions. When the lengthscales are non-stationarity in the raw input space, learning a warping function that maps raw inputs to a warped space where the lengthscales are stationary can be useful, because then standard stationary kernels can be used to effectively model the function.\n", "\n", - "In general, for a relatively simple setup (like this one), we recommend using [Ax](https://ax.dev), since this will simplify your setup (including the amount of code you need to write) considerably. See the [Using BoTorch with Ax](./custom_botorch_model_in_ax) tutorial. To use input warping with `MODULAR_BOTORCH`, we can pass the `warp_tf`, constructed as below, by adding `input_transform=warp_tf` argument to the `Surrogate(...)` call. \n", + "In general, for a relatively simple setup (like this one), we recommend using [Ax](https://ax.dev), since this will simplify your setup (including the amount of code you need to write) considerably. See the [Using BoTorch with Ax](/docs/tutorials/custom_botorch_model_in_ax) tutorial. To use input warping with `MODULAR_BOTORCH`, we can pass the `warp_tf`, constructed as below, by adding `input_transform=warp_tf` argument to the `Surrogate(...)` call. \n", "\n", "We consider use a Kumaraswamy CDF as the class of input warping function and learn the concentration parameters ($a>0$ and $b>0$). Kumaraswamy CDFs are quite flexible and map inputs in [0, 1] to outputs in [0, 1]. This work follows the Beta CDF input warping proposed by Snoek et al., but replaces the Beta distribution Kumaraswamy distribution, which has a *differentiable* and closed-form CDF. 
\n", " \n", diff --git a/tutorials/closed_loop_botorch_only/closed_loop_botorch_only.ipynb b/tutorials/closed_loop_botorch_only/closed_loop_botorch_only.ipynb index 56ef65a9d5..49f401df41 100644 --- a/tutorials/closed_loop_botorch_only/closed_loop_botorch_only.ipynb +++ b/tutorials/closed_loop_botorch_only/closed_loop_botorch_only.ipynb @@ -11,7 +11,7 @@ "\n", "In this tutorial, we illustrate how to implement a simple Bayesian Optimization (BO) closed loop in BoTorch.\n", "\n", - "In general, we recommend for a relatively simple setup (like this one) to use Ax, since this will simplify your setup (including the amount of code you need to write) considerably. See the [Using BoTorch with Ax](./custom_botorch_model_in_ax) tutorial.\n", + "In general, we recommend for a relatively simple setup (like this one) to use Ax, since this will simplify your setup (including the amount of code you need to write) considerably. See the [Using BoTorch with Ax](/docs/tutorials/custom_botorch_model_in_ax) tutorial.\n", "\n", "However, you may want to do things that are not easily supported in Ax at this time (like running high-dimensional BO using a VAE+GP model that you jointly train on high-dimensional input data). If you find yourself in such a situation, you will need to write your own optimization loop, as we do in this tutorial.\n", "\n", diff --git a/tutorials/constrained_multi_objective_bo/constrained_multi_objective_bo.ipynb b/tutorials/constrained_multi_objective_bo/constrained_multi_objective_bo.ipynb index 8fc714ff9c..c7782960bd 100644 --- a/tutorials/constrained_multi_objective_bo/constrained_multi_objective_bo.ipynb +++ b/tutorials/constrained_multi_objective_bo/constrained_multi_objective_bo.ipynb @@ -11,7 +11,7 @@ "\n", "In this tutorial, we illustrate how to implement a constrained multi-objective (MO) Bayesian Optimization (BO) closed loop in BoTorch.\n", "\n", - "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one, since this will simplify your setup (including the amount of code you need to write) considerably. See [here](https://ax.dev/tutorials/multiobjective_optimization.html) for an Ax tutorial on MOBO. If desired, you can use a custom BoTorch model in Ax, following the [Using BoTorch with Ax](./custom_botorch_model_in_ax) tutorial. Given a `MultiObjective`, Ax will default to the $q$NEHVI acquisiton function. If desired, this can also be customized by adding `\"botorch_acqf_class\": ,` to the `model_kwargs`.\n", + "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one, since this will simplify your setup (including the amount of code you need to write) considerably. See [here](https://ax.dev/docs/tutorials/multiobjective_optimization.html) for an Ax tutorial on MOBO. If desired, you can use a custom BoTorch model in Ax, following the [Using BoTorch with Ax](/docs/tutorials/custom_botorch_model_in_ax) tutorial. Given a `MultiObjective`, Ax will default to the $q$NEHVI acquisiton function. If desired, this can also be customized by adding `\"botorch_acqf_class\": ,` to the `model_kwargs`.\n", "\n", "We use the parallel ParEGO ($q$ParEGO) [1] and parallel Noisy Expected Hypervolume Improvement ($q$NEHVI) [2] acquisition functions to optimize a synthetic C2-DTLZ2 test function with $M=2$ objectives, $V=1$ constraint, and $d=4$ parameters. 
The two objectives are\n", "$$f_1(\\mathbf x) = (1+ g(\\mathbf x_M))\\cos\\big(\\frac{\\pi}{2}x_1\\big)$$\n", diff --git a/tutorials/custom_acquisition/custom_acquisition.ipynb b/tutorials/custom_acquisition/custom_acquisition.ipynb index ff95b4863d..8d3d52ee05 100644 --- a/tutorials/custom_acquisition/custom_acquisition.ipynb +++ b/tutorials/custom_acquisition/custom_acquisition.ipynb @@ -618,7 +618,7 @@ "source": [ "### Setting up the experiment\n", "\n", - "We will set up a simple experiment to optimize a simple scalarization of the BraninCurrin function (per the weights above). A detailed tutorial on Service API can be found [here](https://ax.dev/tutorials/gpei_hartmann_service.html).\n", + "We will set up a simple experiment to optimize a simple scalarization of the BraninCurrin function (per the weights above). A detailed tutorial on Service API can be found [here](https://ax.dev/docs/tutorials/gpei_hartmann_service.html).\n", "\n", "In order to use the `GenerationStrategy` we just created, we will pass it into the `AxClient`." ] diff --git a/tutorials/custom_botorch_model_in_ax/custom_botorch_model_in_ax.ipynb b/tutorials/custom_botorch_model_in_ax/custom_botorch_model_in_ax.ipynb index 9c3aa59393..56c8532a05 100644 --- a/tutorials/custom_botorch_model_in_ax/custom_botorch_model_in_ax.ipynb +++ b/tutorials/custom_botorch_model_in_ax/custom_botorch_model_in_ax.ipynb @@ -16,9 +16,9 @@ "\n", "In this tutorial, we illustrate how to use a custom BoTorch model within Ax's `botorch_modular` API. This allows us to harness the convenience of Ax for running Bayesian Optimization loops while maintaining full flexibility in modeling.\n", "\n", - "Acquisition functions and their optimizers can be swapped out in much the same fashion. See for example the tutorial for [Implementing a custom acquisition function](./custom_acquisition).\n", + "Acquisition functions and their optimizers can be swapped out in much the same fashion. See for example the tutorial for [Implementing a custom acquisition function](/docs/tutorials/custom_acquisition).\n", "\n", - "If you want to do something non-standard, or would like to have full insight into every aspect of the implementation, please see [this tutorial](./closed_loop_botorch_only) for how to write your own full optimization loop in BoTorch.\n" + "If you want to do something non-standard, or would like to have full insight into every aspect of the implementation, please see [this tutorial](/docs/tutorials/closed_loop_botorch_only) for how to write your own full optimization loop in BoTorch.\n" ] }, { @@ -89,7 +89,7 @@ "\n", "Model definition is straightforward. Here we implement a GPyTorch `ExactGP` that inherits from `GPyTorchModel`; together these two superclasses add all the API calls that BoTorch expects in its various modules. \n", "\n", - "*Note:* BoTorch allows implementing any custom model that follows the `Model` API. For more information, please see the [Model Documentation](../docs/models)." + "*Note:* BoTorch allows implementing any custom model that follows the `Model` API. For more information, please see the [Model Documentation](/docs/models)." 
] }, { @@ -279,7 +279,7 @@ "source": [ "## Optimization with Ax's Service API\n", "\n", - "A detailed tutorial on the Service API can be found [here](https://ax.dev/tutorials/gpei_hartmann_service.html).\n", + "A detailed tutorial on the Service API can be found [here](https://ax.dev/docs/tutorials/gpei_hartmann_service.html).\n", "\n", "In order to customize the way the candidates are created in the Service API, we need to construct a new `GenerationStrategy` and pass it into `AxClient`." ] @@ -1659,7 +1659,7 @@ "source": [ "## Optimization with the Developer API\n", "\n", - "A detailed tutorial on the Service API can be found [here](https://ax.dev/tutorials/gpei_hartmann_developer.html).\n", + "A detailed tutorial on the Service API can be found [here](https://ax.dev/docs/tutorials/gpei_hartmann_developer.html).\n", "\n", "### Set up the Experiment in Ax\n", "\n", diff --git a/tutorials/custom_model/custom_model.ipynb b/tutorials/custom_model/custom_model.ipynb index 54de751106..7c9290e16d 100644 --- a/tutorials/custom_model/custom_model.ipynb +++ b/tutorials/custom_model/custom_model.ipynb @@ -10,7 +10,7 @@ "- Posterior samples (using Pyro)\n", "- Ensemble of ML predictions\n", "\n", - "This tutorial differs from the [Using a custom BoTorch model with Ax](https://botorch.org/tutorials/custom_botorch_model_in_ax) tutorial by focusing more on authoring a new model that is compatible with the BoTorch and less on integrating a custom model with Ax's `botorch_modular` API." + "This tutorial differs from the [Using a custom BoTorch model with Ax](https://botorch.org/docs/tutorials/custom_botorch_model_in_ax) tutorial by focusing more on authoring a new model that is compatible with the BoTorch and less on integrating a custom model with Ax's `botorch_modular` API." ] }, { @@ -962,7 +962,7 @@ "metadata": {}, "source": [ "#### CMA-ES\n", - "We can also move the optimization loop out of BoTorch entirely and follow the [CMA-ES tutorial](https://botorch.org/tutorials/optimize_with_cmaes) to optimize with an evolution strategy." + "We can also move the optimization loop out of BoTorch entirely and follow the [CMA-ES tutorial](https://botorch.org/docs/tutorials/optimize_with_cmaes) to optimize with an evolution strategy." 
] }, { diff --git a/tutorials/discrete_multi_fidelity_bo/discrete_multi_fidelity_bo.ipynb b/tutorials/discrete_multi_fidelity_bo/discrete_multi_fidelity_bo.ipynb index 65d0921291..a19f401cc1 100644 --- a/tutorials/discrete_multi_fidelity_bo/discrete_multi_fidelity_bo.ipynb +++ b/tutorials/discrete_multi_fidelity_bo/discrete_multi_fidelity_bo.ipynb @@ -6,7 +6,7 @@ "source": [ "## Multi-Fidelity BO with Discrete Fidelities using KG\n", "\n", - "In this tutorial, we show how to do multi-fidelity BO with discrete fidelities based on [1], where each fidelity is a different \"information source.\" This tutorial uses the same setup as the [continuous multi-fidelity BO tutorial](https://botorch.org/tutorials/multi_fidelity_bo), except with discrete fidelity parameters that are interpreted as multiple information sources.\n", + "In this tutorial, we show how to do multi-fidelity BO with discrete fidelities based on [1], where each fidelity is a different \"information source.\" This tutorial uses the same setup as the [continuous multi-fidelity BO tutorial](https://botorch.org/docs/tutorials/multi_fidelity_bo), except with discrete fidelity parameters that are interpreted as multiple information sources.\n", "\n", "We use a GP model with a single task that models the design and fidelity parameters jointly. In some cases, where there is not a natural ordering in the fidelity space, it may be more appropriate to use a multi-task model (with, say, an ICM kernel). We will provide a tutorial once this functionality is in place.\n", "\n", diff --git a/tutorials/fit_model_with_torch_optimizer/fit_model_with_torch_optimizer.ipynb b/tutorials/fit_model_with_torch_optimizer/fit_model_with_torch_optimizer.ipynb index e1ac189d60..c5d2bf4447 100644 --- a/tutorials/fit_model_with_torch_optimizer/fit_model_with_torch_optimizer.ipynb +++ b/tutorials/fit_model_with_torch_optimizer/fit_model_with_torch_optimizer.ipynb @@ -282,7 +282,7 @@ "source": [ "### Interfacing with Ax\n", "\n", - "It is simple to package up a custom optimizer loop like the one above and use it within Ax. As described in the [Using BoTorch with Ax tutorial](./custom_botorch_model_in_ax), this requires defining a custom `model_constructor` callable that can then be passed to the `get_botorch` factory function." + "It is simple to package up a custom optimizer loop like the one above and use it within Ax. As described in the [Using BoTorch with Ax tutorial](/docs/tutorials/custom_botorch_model_in_ax), this requires defining a custom `model_constructor` callable that can then be passed to the `get_botorch` factory function." ] }, { diff --git a/tutorials/max_value_entropy/max_value_entropy.ipynb b/tutorials/max_value_entropy/max_value_entropy.ipynb index 62141266e0..1c0b573e29 100644 --- a/tutorials/max_value_entropy/max_value_entropy.ipynb +++ b/tutorials/max_value_entropy/max_value_entropy.ipynb @@ -8,7 +8,7 @@ "\n", "Max-value entropy search (MES) acquisition function quantifies the information gain about the maximum of a black-box function by observing this black-box function $f$ at the candidate set $\\{\\textbf{x}\\}$ (see [1, 2]). BoTorch provides implementations of the MES acquisition function and its multi-fidelity (MF) version with support for trace observations. In this tutorial, we explain at a high level how the MES acquisition function works, its implementation in BoTorch and how to use the MES acquisition function to query the next point in the optimization process. 
\n", "\n", - "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one, since this will simplify your setup (including the amount of code you need to write) considerably. You can use a custom BoTorch model and acquisition function in Ax, following the [Using BoTorch with Ax](./custom_botorch_model_in_ax) tutorial. To use the MES acquisition function, it is sufficient to add `\"botorch_acqf_class\": qMaxValueEntropy,` to `model_kwargs`. The linked tutorial shows how to use a custom BoTorch model. If you'd like to let Ax choose which model to use based on the properties of the search space, you can skip the `surrogate` argument in `model_kwargs`.\n", + "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one, since this will simplify your setup (including the amount of code you need to write) considerably. You can use a custom BoTorch model and acquisition function in Ax, following the [Using BoTorch with Ax](/docs/tutorials/custom_botorch_model_in_ax) tutorial. To use the MES acquisition function, it is sufficient to add `\"botorch_acqf_class\": qMaxValueEntropy,` to `model_kwargs`. The linked tutorial shows how to use a custom BoTorch model. If you'd like to let Ax choose which model to use based on the properties of the search space, you can skip the `surrogate` argument in `model_kwargs`.\n", "\n", "### 1. MES acquisition function for $q=1$ with noisy observation\n", "For illustrative purposes, we focus in this section on the non-q-batch-mode case ($q=1$). We also assume that the evaluation of the black-box function is noisy. Let us first introduce some notation: \n", diff --git a/tutorials/multi_objective_bo/multi_objective_bo.ipynb b/tutorials/multi_objective_bo/multi_objective_bo.ipynb index 1b460f9747..3bd4993411 100644 --- a/tutorials/multi_objective_bo/multi_objective_bo.ipynb +++ b/tutorials/multi_objective_bo/multi_objective_bo.ipynb @@ -11,7 +11,7 @@ "\n", "In this tutorial, we illustrate how to implement a simple multi-objective (MO) Bayesian Optimization (BO) closed loop in BoTorch.\n", "\n", - "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one, since this will simplify your setup (including the amount of code you need to write) considerably. See [here](https://ax.dev/tutorials/multiobjective_optimization.html) for an Ax tutorial on MOBO. If desired, you can use a custom BoTorch model in Ax, following the [Using BoTorch with Ax](./custom_botorch_model_in_ax) tutorial. Given a `MultiObjective`, Ax will default to the $q$NEHVI acquisiton function. If desired, this can also be customized by adding `\"botorch_acqf_class\": ,` to the `model_kwargs`.\n", + "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one, since this will simplify your setup (including the amount of code you need to write) considerably. See [here](https://ax.dev/docs/tutorials/multiobjective_optimization.html) for an Ax tutorial on MOBO. If desired, you can use a custom BoTorch model in Ax, following the [Using BoTorch with Ax](/docs/tutorials/custom_botorch_model_in_ax) tutorial. Given a `MultiObjective`, Ax will default to the $q$NEHVI acquisiton function. 
If desired, this can also be customized by adding `\"botorch_acqf_class\": ,` to the `model_kwargs`.\n", "\n", "We use the parallel ParEGO ($q$ParEGO) [1], parallel Expected Hypervolume Improvement ($q$EHVI) [1], and parallel Noisy Expected Hypervolume Improvement ($q$NEHVI) [2] acquisition functions to optimize a synthetic BraninCurrin problem test function with additive Gaussian observation noise over a 2-parameter search space [0,1]^2. See `botorch/test_functions/multi_objective.py` for details on BraninCurrin. The noise standard deviations are 15.19 and 0.63 for each objective, respectively.\n", "\n", diff --git a/tutorials/one_shot_kg/one_shot_kg.ipynb b/tutorials/one_shot_kg/one_shot_kg.ipynb index 5f4414c91d..6ff9371fff 100644 --- a/tutorials/one_shot_kg/one_shot_kg.ipynb +++ b/tutorials/one_shot_kg/one_shot_kg.ipynb @@ -15,7 +15,7 @@ "$$\n", "where $\\xi \\sim \\mathcal{P}(f(x') \\mid \\mathcal{D} \\cup \\mathcal{D}_{\\mathbf{x}})$ is the posterior at $x'$ conditioned on $\\mathcal{D}_{\\mathbf{x}}$, the (random) dataset observed at $\\mathbf{x}$, and $\\mu := \\max_{x}\\mathbb{E}[g(f(x)) \\mid \\mathcal{D}]$.\n", "\n", - "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one, since this will simplify your setup (including the amount of code you need to write) considerably. You can use a custom BoTorch model and acquisition function in Ax, following the [Using BoTorch with Ax](./custom_botorch_model_in_ax) tutorial. To use the KG acquisition function, it is sufficient to add `\"botorch_acqf_class\": qKnowledgeGradient,` to `model_kwargs`. The linked tutorial shows how to use a custom BoTorch model. If you'd like to let Ax choose which model to use based on the properties of the search space, you can skip the `surrogate` argument in `model_kwargs`.\n", + "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one, since this will simplify your setup (including the amount of code you need to write) considerably. You can use a custom BoTorch model and acquisition function in Ax, following the [Using BoTorch with Ax](/docs/tutorials/custom_botorch_model_in_ax) tutorial. To use the KG acquisition function, it is sufficient to add `\"botorch_acqf_class\": qKnowledgeGradient,` to `model_kwargs`. The linked tutorial shows how to use a custom BoTorch model. If you'd like to let Ax choose which model to use based on the properties of the search space, you can skip the `surrogate` argument in `model_kwargs`.\n", "\n", "\n", "#### Optimizing KG\n", diff --git a/tutorials/optimize_stochastic/optimize_stochastic.ipynb b/tutorials/optimize_stochastic/optimize_stochastic.ipynb index 965e1430c7..28b2f0511b 100644 --- a/tutorials/optimize_stochastic/optimize_stochastic.ipynb +++ b/tutorials/optimize_stochastic/optimize_stochastic.ipynb @@ -13,7 +13,7 @@ "\n", "*Note:* A pre-packaged, more user-friendly version of the optimization loop we will develop below is contained in the `gen_candidates_torch` function in the `botorch.gen` module. This tutorial should be quite useful if you would like to implement custom optimizers beyond what is contained in `gen_candidates_torch`.\n", "\n", - "As discussed in the [CMA-ES tutorial](./optimize_with_cmaes), for deterministic acquisition functions BoTorch uses quasi-second order methods (such as L-BFGS-B or SLSQP) by default, which provide superior convergence speed in this situation. 
" + "As discussed in the [CMA-ES tutorial](/docs/tutorials/optimize_with_cmaes), for deterministic acquisition functions BoTorch uses quasi-second order methods (such as L-BFGS-B or SLSQP) by default, which provide superior convergence speed in this situation. " ] }, { diff --git a/tutorials/optimize_with_cmaes/optimize_with_cmaes.ipynb b/tutorials/optimize_with_cmaes/optimize_with_cmaes.ipynb index ffe00a5353..72fb07030e 100644 --- a/tutorials/optimize_with_cmaes/optimize_with_cmaes.ipynb +++ b/tutorials/optimize_with_cmaes/optimize_with_cmaes.ipynb @@ -19,7 +19,7 @@ "source": [ "### Setting up the acquisition function\n", "\n", - "For the purpose of this tutorial, we'll use a basic `UpperConfidenceBound` acquisition function on a basic model fit on synthetic data. Please see the documentation for [Models](../docs/models) and [Acquisition Functions](../docs/acquisition) for more information." + "For the purpose of this tutorial, we'll use a basic `UpperConfidenceBound` acquisition function on a basic model fit on synthetic data. Please see the documentation for [Models](/docs/models) and [Acquisition Functions](/docs/acquisition) for more information." ] }, { diff --git a/tutorials/risk_averse_bo_with_input_perturbations/risk_averse_bo_with_input_perturbations.ipynb b/tutorials/risk_averse_bo_with_input_perturbations/risk_averse_bo_with_input_perturbations.ipynb index 055ec9d3e6..0387cabfb3 100644 --- a/tutorials/risk_averse_bo_with_input_perturbations/risk_averse_bo_with_input_perturbations.ipynb +++ b/tutorials/risk_averse_bo_with_input_perturbations/risk_averse_bo_with_input_perturbations.ipynb @@ -18,7 +18,7 @@ "\n", "In this setting, we want to find high-performing designs that are also robust to the effects of the input perturbations. \n", "To do so, we will follow the Bayesian optimization of risk measures framework introduced in [1]. \n", - "Please refer to the [Risk averse Bayesian optimization with environmental variables](https://botorch.org/tutorials/risk_averse_bo_with_environmental_variables) notebook for additional background on this.\n", + "Please refer to the [Risk averse Bayesian optimization with environmental variables](https://botorch.org/docs/tutorials/risk_averse_bo_with_environmental_variables) notebook for additional background on this.\n", "\n", "In this notebook, we will use the `qNoisyExpectedImprovement` acquisition function to optimize the VaR risk measure at risk level $\\alpha=0.8$, computed w.r.t. the perturbations in the inputs. To do so, we will:\n", " - Use `InputPerturbation` input transform to add a set of samples of $\\Delta_x$ to each given $x$;\n", diff --git a/tutorials/saasbo/saasbo.ipynb b/tutorials/saasbo/saasbo.ipynb index 693b96009a..4ea4ee9475 100644 --- a/tutorials/saasbo/saasbo.ipynb +++ b/tutorials/saasbo/saasbo.ipynb @@ -27,7 +27,7 @@ "cubically with the number of datapoints. Depending on the problem, using more than a few hundred\n", "evaluations may not be feasible as SAASBO is designed for problems with a limited evaluation budget.\n", "\n", - "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one. See [here](https://ax.dev/tutorials/saasbo.html) for a SAASBO tutorial in Ax, which uses the Log Noisy Expected Improvement acquisition function. Therefore, this tutorial shows a minimal illustrative example of how to use SAASBO with only BoTorch. 
To customize the acquisition function used with SAASBO in Ax, see the [custom acquisition tutorial](./custom_acquisition), where adding `\\\"surrogate\\\": Surrogate(SaasFullyBayesianSingleTaskGP),` to the `model_kwargs` of `BOTORCH_MODULAR` step is sufficient to enable the SAAS model.\n", + "In general, we recommend using [Ax](https://ax.dev) for a simple BO setup like this one. See [here](https://ax.dev/docs/tutorials/saasbo.html) for a SAASBO tutorial in Ax, which uses the Log Noisy Expected Improvement acquisition function. Therefore, this tutorial shows a minimal illustrative example of how to use SAASBO with only BoTorch. To customize the acquisition function used with SAASBO in Ax, see the [custom acquisition tutorial](/docs/tutorials/custom_acquisition), where adding `\\\"surrogate\\\": Surrogate(SaasFullyBayesianSingleTaskGP),` to the `model_kwargs` of the `BOTORCH_MODULAR` step is sufficient to enable the SAAS model.\n", "\n", "[1]: [D. Eriksson, M. Jankowiak. High-Dimensional Bayesian Optimization with Sparse Axis-Aligned Subspaces. Proceedings of the Thirty-Seventh Conference on Uncertainty in Artificial Intelligence, 2021.](https://proceedings.mlr.press/v161/eriksson21a.html)" ] diff --git a/tutorials/scalable_constrained_bo/scalable_constrained_bo.ipynb b/tutorials/scalable_constrained_bo/scalable_constrained_bo.ipynb index d88e8a49f9..5374ae6e49 100644 --- a/tutorials/scalable_constrained_bo/scalable_constrained_bo.ipynb +++ b/tutorials/scalable_constrained_bo/scalable_constrained_bo.ipynb @@ -12,7 +12,7 @@ "[1]: David Eriksson and Matthias Poloczek. Scalable constrained Bayesian optimization. In International Conference on Artificial Intelligence and Statistics, pages 730–738. PMLR, 2021.\n", "(https://doi.org/10.48550/arxiv.2002.08526)\n", "\n", - "Since SCBO is essentially a constrained version of Trust Region Bayesian Optimization (TuRBO), this tutorial shares much of the same code as the TuRBO Tutorial (https://botorch.org/tutorials/turbo_1) with small modifications made to implement SCBO." + "Since SCBO is essentially a constrained version of Trust Region Bayesian Optimization (TuRBO), this tutorial shares much of the same code as the TuRBO Tutorial (https://botorch.org/docs/tutorials/turbo_1) with small modifications made to implement SCBO." ] }, { @@ -142,7 +142,7 @@ "source": [ "## Define TuRBO Class\n", "\n", - "Just as in the TuRBO Tutorial (https://botorch.org/tutorials/turbo_1), we'll define a class to hold the turst region state and a method update_state() to update the side length of the trust region hyper-cube during optimization. We'll update the side length according to the number of sequential successes or failures as discussed in the original TuRBO paper. " + "Just as in the TuRBO Tutorial (https://botorch.org/docs/tutorials/turbo_1), we'll define a class to hold the trust region state and a method update_state() to update the side length of the trust region hyper-cube during optimization. We'll update the side length according to the number of sequential successes or failures as discussed in the original TuRBO paper. " ] }, { @@ -286,7 +286,7 @@ "source": [ "### Generating a batch of candidates for SCBO \n", "\n", - "Just as in the TuRBO Tutorial (https://botorch.org/tutorials/turbo_1), we'll define a method generate_batch to generate a new batch of candidate points within the TuRBO trust region using Thompson sampling. 
\n", + "Just as in the TuRBO Tutorial (https://botorch.org/docs/tutorials/turbo_1), we'll define a method generate_batch to generate a new batch of candidate points within the TuRBO trust region using Thompson sampling. \n", "\n", "The key difference here from TuRBO is that, instead of using MaxPosteriorSampling to simply grab the candidates within the trust region with the maximum posterior values, we use ConstrainedMaxPosteriorSampling to instead grab the candidates within the trust region with the maximum posterior values subject to the constraint that the posteriors for the constraint models for c1(x) and c2(x) must be less than or equal to 0 for both candidates. \n", "\n", diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index a9d7b7d9a1..095a1863c4 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -31,8 +31,8 @@ module.exports={ "users": [], "wrapPagesHTML": true }, - "onBrokenLinks": "log", - "onBrokenMarkdownLinks": "log", + "onBrokenLinks": "throw", + "onBrokenMarkdownLinks": "warn", "presets": [ [ "@docusaurus/preset-classic", diff --git a/website/src/pages/index.js b/website/src/pages/index.js index 12a4d16a48..5d0ad9f518 100755 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -13,6 +13,7 @@ import Layout from "@theme/Layout"; import useBaseUrl from '@docusaurus/useBaseUrl'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import CodeBlock from '@theme/CodeBlock'; +import Heading from '@theme/Heading'; const features = [ { @@ -94,7 +95,7 @@ const HomeSplash = () => { Introduction Get started @@ -149,14 +150,13 @@ candidate # tensor([[0.2981, 0.2401]], dtype=torch.float64)`; year = 2020, url = {http://arxiv.org/abs/1910.06403} }`; - // + const QuickStart = () => (
[remainder of the website/src/pages/index.js hunk: the "Get Started" markup inside the QuickStart section is changed (likely to use the newly imported Heading component); the surrounding JSX tags were lost in extraction]
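Several of the tutorial notes in the hunks above point at the same Ax integration hook: passing a BoTorch acquisition class via `"botorch_acqf_class"` (e.g. `qMaxValueEntropy` or `qKnowledgeGradient`), or a `Surrogate`, through the `model_kwargs` of a `BOTORCH_MODULAR` generation step. The sketch below is not part of this diff; it is a minimal illustration of that wiring, assuming a pre-1.0 Ax release where `GenerationStrategy`, `Models.BOTORCH_MODULAR`, and `Surrogate` live at the import paths shown.

```python
# Illustrative sketch only (not part of this diff). Assumes a pre-1.0 Ax release
# exposing GenerationStrategy / Models.BOTORCH_MODULAR at these import paths.
from ax.modelbridge.generation_strategy import GenerationStep, GenerationStrategy
from ax.modelbridge.registry import Models
from botorch.acquisition.max_value_entropy_search import qMaxValueEntropy

generation_strategy = GenerationStrategy(
    steps=[
        # Space-filling initialization before any model-based candidates.
        GenerationStep(model=Models.SOBOL, num_trials=5),
        # Model-based candidates using a hand-picked BoTorch acquisition class.
        GenerationStep(
            model=Models.BOTORCH_MODULAR,
            num_trials=-1,  # use this step for all remaining trials
            model_kwargs={
                # Swap in qKnowledgeGradient (one_shot_kg note) the same way, or
                # omit the key to fall back to Ax's default acquisition function.
                "botorch_acqf_class": qMaxValueEntropy,
                # Per the SAASBO note, a SAAS surrogate could instead be requested
                # with: "surrogate": Surrogate(SaasFullyBayesianSingleTaskGP)
                # (Surrogate from ax.models.torch.botorch_modular.surrogate,
                #  SaasFullyBayesianSingleTaskGP from botorch.models.fully_bayesian).
            },
        ),
    ]
)
# The strategy is then handed to the Ax service API, e.g.
# AxClient(generation_strategy=generation_strategy).
```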