2021-01-22 12:29:04 -07:00
// SPDX-License-Identifier: MIT
2019-10-21 10:43:39 -07:00
/*
* Copyright © 2019 Intel Corporation
*/
2020-06-18 08:04:02 -07:00
# include "i915_drv.h"
2019-10-21 10:43:39 -07:00
# include "i915_request.h"
# include "intel_context.h"
# include "intel_engine_heartbeat.h"
# include "intel_engine_pm.h"
# include "intel_engine.h"
# include "intel_gt.h"
2019-10-23 06:31:08 -07:00
# include "intel_reset.h"
/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that restores progress.
 */
static bool next_heartbeat ( struct intel_engine_cs * engine )
{
2022-10-06 14:38:12 -07:00
struct i915_request * rq ;
2019-10-23 06:31:08 -07:00
long delay ;
delay = READ_ONCE ( engine - > props . heartbeat_interval_ms ) ;
2022-10-06 14:38:12 -07:00
rq = engine - > heartbeat . systole ;
/*
* FIXME : The final period extension is disabled if the period has been
* modified from the default . This is to prevent issues with certain
* selftests which override the value and expect specific behaviour .
* Once the selftests have been updated to either cope with variable
* heartbeat periods ( or to override the pre - emption timeout as well ,
* or just to add a selftest specific override of the extension ) , the
* generic override can be removed .
*/
if ( rq & & rq - > sched . attr . priority > = I915_PRIORITY_BARRIER & &
delay = = engine - > defaults . heartbeat_interval_ms ) {
long longer ;
/*
* The final try is at the highest priority possible . Up until now
* a pre - emption might not even have been attempted . So make sure
* this last attempt allows enough time for a pre - emption to occur .
*/
longer = READ_ONCE ( engine - > props . preempt_timeout_ms ) * 2 ;
longer = intel_clamp_heartbeat_interval_ms ( engine , longer ) ;
if ( longer > delay )
delay = longer ;
}
2019-10-23 06:31:08 -07:00
if ( ! delay )
return false ;
delay = msecs_to_jiffies_timeout ( delay ) ;
if ( delay > = HZ )
delay = round_jiffies_up_relative ( delay ) ;
2021-02-04 14:13:03 -07:00
mod_delayed_work ( system_highpri_wq , & engine - > heartbeat . work , delay + 1 ) ;
2019-10-23 06:31:08 -07:00
return true ;
}
2019-10-21 10:43:39 -07:00
2020-12-24 09:02:13 -07:00
/*
 * Allocate a request on context @ce using allocation flags @gfp.
 *
 * The context is entered only for the duration of the
 * __i915_request_create() call and exited again before returning.
 *
 * Returns whatever __i915_request_create() returned; callers in this file
 * check it with IS_ERR().
 */
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *pulse;

	intel_context_enter(ce);
	pulse = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return pulse;
}
2019-10-21 10:43:39 -07:00
static void idle_pulse ( struct intel_engine_cs * engine , struct i915_request * rq )
{
engine - > wakeref_serial = READ_ONCE ( engine - > serial ) + 1 ;
i915_request_add_active_barriers ( rq ) ;
2020-10-06 02:46:53 -07:00
if ( ! engine - > heartbeat . systole & & intel_engine_has_heartbeat ( engine ) )
engine - > heartbeat . systole = i915_request_get ( rq ) ;
2019-10-21 10:43:39 -07:00
}
2020-12-24 09:02:13 -07:00
static void heartbeat_commit ( struct i915_request * rq ,
const struct i915_sched_attr * attr )
{
idle_pulse ( rq - > engine , rq ) ;
__i915_request_commit ( rq ) ;
__i915_request_queue ( rq , attr ) ;
}
2019-10-23 06:31:08 -07:00
static void show_heartbeat ( const struct i915_request * rq ,
struct intel_engine_cs * engine )
{
2024-02-19 06:14:23 -07:00
struct drm_printer p =
drm_dbg_printer ( & engine - > i915 - > drm , DRM_UT_DRIVER , " heartbeat " ) ;
2019-10-23 06:31:08 -07:00
2021-07-26 17:23:24 -07:00
if ( ! rq ) {
intel_engine_dump ( engine , & p ,
" %s heartbeat not ticking \n " ,
engine - > name ) ;
} else {
intel_engine_dump ( engine , & p ,
" %s heartbeat {seqno:%llx:%lld, prio:%d} not ticking \n " ,
engine - > name ,
rq - > fence . context ,
rq - > fence . seqno ,
rq - > sched . attr . priority ) ;
}
}
static void
reset_engine ( struct intel_engine_cs * engine , struct i915_request * rq )
{
if ( IS_ENABLED ( CONFIG_DRM_I915_DEBUG_GEM ) )
show_heartbeat ( rq , engine ) ;
2021-07-26 17:23:34 -07:00
if ( intel_engine_uses_guc ( engine ) )
/*
* GuC itself is toast or GuC ' s hang detection
* is disabled . Either way , need to find the
* hang culprit manually .
*/
intel_guc_find_hung_context ( engine ) ;
2021-07-26 17:23:24 -07:00
intel_gt_handle_error ( engine - > gt , engine - > mask ,
I915_ERROR_CAPTURE ,
" stopped heartbeat on %s " ,
engine - > name ) ;
2019-10-23 06:31:08 -07:00
}
static void heartbeat ( struct work_struct * wrk )
{
2021-01-20 05:14:39 -07:00
struct i915_sched_attr attr = { . priority = I915_PRIORITY_MIN } ;
2019-10-23 06:31:08 -07:00
struct intel_engine_cs * engine =
container_of ( wrk , typeof ( * engine ) , heartbeat . work . work ) ;
struct intel_context * ce = engine - > kernel_context ;
struct i915_request * rq ;
2020-07-02 02:52:18 -07:00
unsigned long serial ;
2019-10-23 06:31:08 -07:00
2020-06-15 09:50:13 -07:00
/* Just in case everything has gone horribly wrong, give it a kick */
intel_engine_flush_submission ( engine ) ;
2019-10-23 06:31:08 -07:00
rq = engine - > heartbeat . systole ;
if ( rq & & i915_request_completed ( rq ) ) {
i915_request_put ( rq ) ;
engine - > heartbeat . systole = NULL ;
}
2019-11-06 15:34:10 -07:00
if ( ! intel_engine_pm_get_if_awake ( engine ) )
return ;
2019-10-23 06:31:08 -07:00
if ( intel_gt_is_wedged ( engine - > gt ) )
goto out ;
2021-07-26 17:23:24 -07:00
if ( i915_sched_engine_disabled ( engine - > sched_engine ) ) {
reset_engine ( engine , engine - > heartbeat . systole ) ;
goto out ;
}
2019-10-23 06:31:08 -07:00
if ( engine - > heartbeat . systole ) {
2021-02-04 14:13:03 -07:00
long delay = READ_ONCE ( engine - > props . heartbeat_interval_ms ) ;
/* Safeguard against too-fast worker invocations */
if ( ! time_after ( jiffies ,
rq - > emitted_jiffies + msecs_to_jiffies ( delay ) ) )
goto out ;
2020-05-28 00:41:00 -07:00
if ( ! i915_sw_fence_signaled ( & rq - > submit ) ) {
/*
* Not yet submitted , system is stalled .
*
* This more often happens for ring submission ,
* where all contexts are funnelled into a common
* ringbuffer . If one context is blocked on an
* external fence , not only is it not submitted ,
* but all other contexts , including the kernel
* context are stuck waiting for the signal .
*/
2021-06-17 18:06:35 -07:00
} else if ( engine - > sched_engine - > schedule & &
2020-05-28 00:41:00 -07:00
rq - > sched . attr . priority < I915_PRIORITY_BARRIER ) {
2019-10-23 06:31:08 -07:00
/*
* Gradually raise the priority of the heartbeat to
* give high priority work [ which presumably desires
* low latency and no jitter ] the chance to naturally
* complete before being preempted .
*/
2023-10-23 05:13:05 -07:00
attr . priority = I915_PRIORITY_NORMAL ;
2019-10-23 06:31:08 -07:00
if ( rq - > sched . attr . priority > = attr . priority )
2021-01-20 05:14:39 -07:00
attr . priority = I915_PRIORITY_HEARTBEAT ;
2019-10-23 06:31:08 -07:00
if ( rq - > sched . attr . priority > = attr . priority )
attr . priority = I915_PRIORITY_BARRIER ;
local_bh_disable ( ) ;
2021-06-17 18:06:35 -07:00
engine - > sched_engine - > schedule ( rq , & attr ) ;
2019-10-23 06:31:08 -07:00
local_bh_enable ( ) ;
} else {
2021-07-26 17:23:24 -07:00
reset_engine ( engine , rq ) ;
2019-10-23 06:31:08 -07:00
}
2021-02-04 14:13:03 -07:00
rq - > emitted_jiffies = jiffies ;
2019-10-23 06:31:08 -07:00
goto out ;
}
2020-07-02 02:52:18 -07:00
serial = READ_ONCE ( engine - > serial ) ;
if ( engine - > wakeref_serial = = serial )
2019-10-23 06:31:08 -07:00
goto out ;
2020-07-02 02:52:18 -07:00
if ( ! mutex_trylock ( & ce - > timeline - > mutex ) ) {
/* Unable to lock the kernel timeline, is the engine stuck? */
if ( xchg ( & engine - > heartbeat . blocked , serial ) = = serial )
intel_gt_handle_error ( engine - > gt , engine - > mask ,
I915_ERROR_CAPTURE ,
" no heartbeat on %s " ,
engine - > name ) ;
goto out ;
}
2019-10-23 06:31:08 -07:00
2020-12-24 09:02:13 -07:00
rq = heartbeat_create ( ce , GFP_NOWAIT | __GFP_NOWARN ) ;
2019-10-23 06:31:08 -07:00
if ( IS_ERR ( rq ) )
goto unlock ;
2020-12-24 09:02:13 -07:00
heartbeat_commit ( rq , & attr ) ;
2019-10-23 06:31:08 -07:00
unlock :
mutex_unlock ( & ce - > timeline - > mutex ) ;
out :
2020-10-06 02:46:53 -07:00
if ( ! engine - > i915 - > params . enable_hangcheck | | ! next_heartbeat ( engine ) )
2019-10-23 06:31:08 -07:00
i915_request_put ( fetch_and_zero ( & engine - > heartbeat . systole ) ) ;
intel_engine_pm_put ( engine ) ;
}
void intel_engine_unpark_heartbeat ( struct intel_engine_cs * engine )
{
2021-10-05 10:17:28 -07:00
if ( ! CONFIG_DRM_I915_HEARTBEAT_INTERVAL )
2019-10-23 06:31:08 -07:00
return ;
next_heartbeat ( engine ) ;
}
void intel_engine_park_heartbeat ( struct intel_engine_cs * engine )
{
2019-11-06 06:31:29 -07:00
if ( cancel_delayed_work ( & engine - > heartbeat . work ) )
i915_request_put ( fetch_and_zero ( & engine - > heartbeat . systole ) ) ;
2019-10-23 06:31:08 -07:00
}
2021-07-26 17:23:24 -07:00
void intel_gt_unpark_heartbeats ( struct intel_gt * gt )
{
struct intel_engine_cs * engine ;
enum intel_engine_id id ;
for_each_engine ( engine , gt , id )
if ( intel_engine_pm_is_awake ( engine ) )
intel_engine_unpark_heartbeat ( engine ) ;
}
void intel_gt_park_heartbeats ( struct intel_gt * gt )
{
struct intel_engine_cs * engine ;
enum intel_engine_id id ;
for_each_engine ( engine , gt , id )
intel_engine_park_heartbeat ( engine ) ;
}
2019-10-23 06:31:08 -07:00
void intel_engine_init_heartbeat ( struct intel_engine_cs * engine )
{
INIT_DELAYED_WORK ( & engine - > heartbeat . work , heartbeat ) ;
}
2020-09-28 15:15:09 -07:00
static int __intel_engine_pulse ( struct intel_engine_cs * engine )
{
struct i915_sched_attr attr = { . priority = I915_PRIORITY_BARRIER } ;
struct intel_context * ce = engine - > kernel_context ;
struct i915_request * rq ;
lockdep_assert_held ( & ce - > timeline - > mutex ) ;
GEM_BUG_ON ( ! intel_engine_has_preemption ( engine ) ) ;
GEM_BUG_ON ( ! intel_engine_pm_is_awake ( engine ) ) ;
2020-12-24 09:02:13 -07:00
rq = heartbeat_create ( ce , GFP_NOWAIT | __GFP_NOWARN ) ;
2020-09-28 15:15:09 -07:00
if ( IS_ERR ( rq ) )
return PTR_ERR ( rq ) ;
__set_bit ( I915_FENCE_FLAG_SENTINEL , & rq - > fence . flags ) ;
2020-12-24 09:02:13 -07:00
heartbeat_commit ( rq , & attr ) ;
2020-09-28 15:15:09 -07:00
GEM_BUG_ON ( rq - > sched . attr . priority < I915_PRIORITY_BARRIER ) ;
2024-01-10 14:02:16 -07:00
/* Ensure the forced pulse gets a full period to execute */
next_heartbeat ( engine ) ;
2020-09-28 15:15:09 -07:00
return 0 ;
}
static unsigned long set_heartbeat ( struct intel_engine_cs * engine ,
unsigned long delay )
{
unsigned long old ;
old = xchg ( & engine - > props . heartbeat_interval_ms , delay ) ;
if ( delay )
intel_engine_unpark_heartbeat ( engine ) ;
else
intel_engine_park_heartbeat ( engine ) ;
return old ;
}
2019-10-23 06:31:08 -07:00
int intel_engine_set_heartbeat ( struct intel_engine_cs * engine ,
unsigned long delay )
{
2020-09-28 15:15:09 -07:00
struct intel_context * ce = engine - > kernel_context ;
int err = 0 ;
2019-10-23 06:31:08 -07:00
2020-09-28 15:15:09 -07:00
if ( ! delay & & ! intel_engine_has_preempt_reset ( engine ) )
return - ENODEV ;
2022-10-06 14:38:12 -07:00
/* FIXME: Remove together with equally marked hack in next_heartbeat. */
if ( delay ! = engine - > defaults . heartbeat_interval_ms & &
delay < 2 * engine - > props . preempt_timeout_ms ) {
if ( intel_engine_uses_guc ( engine ) )
drm_notice ( & engine - > i915 - > drm , " %s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets! \n " ,
engine - > name ) ;
else
drm_notice ( & engine - > i915 - > drm , " %s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts! \n " ,
engine - > name ) ;
}
2020-09-28 15:15:09 -07:00
intel_engine_pm_get ( engine ) ;
err = mutex_lock_interruptible ( & ce - > timeline - > mutex ) ;
if ( err )
goto out_rpm ;
2019-10-23 06:31:08 -07:00
2020-09-28 15:15:09 -07:00
if ( delay ! = engine - > props . heartbeat_interval_ms ) {
unsigned long saved = set_heartbeat ( engine , delay ) ;
2019-10-23 06:31:08 -07:00
2020-09-28 15:15:09 -07:00
/* recheck current execution */
if ( intel_engine_has_preemption ( engine ) ) {
err = __intel_engine_pulse ( engine ) ;
if ( err )
set_heartbeat ( engine , saved ) ;
}
2019-10-23 06:31:08 -07:00
}
2020-09-28 15:15:09 -07:00
mutex_unlock ( & ce - > timeline - > mutex ) ;
out_rpm :
intel_engine_pm_put ( engine ) ;
return err ;
2019-10-23 06:31:08 -07:00
}
2019-10-21 10:43:39 -07:00
int intel_engine_pulse ( struct intel_engine_cs * engine )
{
struct intel_context * ce = engine - > kernel_context ;
2020-02-18 09:21:42 -07:00
int err ;
2019-10-21 10:43:39 -07:00
if ( ! intel_engine_has_preemption ( engine ) )
return - ENODEV ;
if ( ! intel_engine_pm_get_if_awake ( engine ) )
return 0 ;
2020-09-28 15:15:09 -07:00
err = - EINTR ;
if ( ! mutex_lock_interruptible ( & ce - > timeline - > mutex ) ) {
err = __intel_engine_pulse ( engine ) ;
mutex_unlock ( & ce - > timeline - > mutex ) ;
2019-10-21 10:43:39 -07:00
}
2021-03-24 05:13:30 -07:00
intel_engine_flush_submission ( engine ) ;
2019-10-21 10:43:39 -07:00
intel_engine_pm_put ( engine ) ;
return err ;
}
int intel_engine_flush_barriers ( struct intel_engine_cs * engine )
{
2021-01-20 05:14:39 -07:00
struct i915_sched_attr attr = { . priority = I915_PRIORITY_MIN } ;
2020-12-24 09:02:13 -07:00
struct intel_context * ce = engine - > kernel_context ;
2019-10-21 10:43:39 -07:00
struct i915_request * rq ;
2020-12-24 09:02:13 -07:00
int err ;
2019-10-21 10:43:39 -07:00
if ( llist_empty ( & engine - > barrier_tasks ) )
return 0 ;
2019-11-25 03:58:56 -07:00
if ( ! intel_engine_pm_get_if_awake ( engine ) )
return 0 ;
2020-12-24 09:02:13 -07:00
if ( mutex_lock_interruptible ( & ce - > timeline - > mutex ) ) {
err = - EINTR ;
goto out_rpm ;
}
rq = heartbeat_create ( ce , GFP_KERNEL ) ;
2019-11-25 03:58:56 -07:00
if ( IS_ERR ( rq ) ) {
err = PTR_ERR ( rq ) ;
2020-12-24 09:02:13 -07:00
goto out_unlock ;
2019-11-25 03:58:56 -07:00
}
2019-10-21 10:43:39 -07:00
2020-12-24 09:02:13 -07:00
heartbeat_commit ( rq , & attr ) ;
2019-10-21 10:43:39 -07:00
2020-12-24 09:02:13 -07:00
err = 0 ;
out_unlock :
mutex_unlock ( & ce - > timeline - > mutex ) ;
2019-11-25 03:58:56 -07:00
out_rpm :
intel_engine_pm_put ( engine ) ;
return err ;
2019-10-21 10:43:39 -07:00
}
# if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
# include "selftest_engine_heartbeat.c"
# endif