Setting up JupyterHub with the sparkmagic plugin

Hi,
We are trying to set up JupyterHub on EMR.
We've customized the jupyterhub_config.py file so that the sparkmagic plugin works for each user.

Scenario:

User sjoshi_motor tries to start a PySpark session in a JupyterHub notebook,
but the kernel tries to access /home/yuvraj/.sparkmagic (the folder of another user who is already logged in) and fails with a permission error (which it should!).

JupyterHub runs on the master node of the cluster,
inside the Docker container that EMR itself provides.

#jupyterhub_config.py
# Configuration file for jupyterhub.

import os
import logging

notebook_dir = os.environ.get('DOCKER_NOTEBOOK_DIR')
network_name = 'jupyterhub-network'
s3_endpoint_url = os.environ.get('S3_ENDPOINT_URL', 's3.amazonaws.com')

c.Spawner.debug = True
#c.Spawner.environment = {'SPARKMAGIC_CONF_DIR':'/etc/jupyter/conf', 'JUPYTER_ENABLE_LAB': 'yes', 'S3_ENDPOINT_URL': s3_endpoint_url}

c.JupyterHub.hub_ip = '0.0.0.0'
c.JupyterHub.admin_access = True
c.JupyterHub.ssl_key = '/etc/jupyter/conf/server.key'
c.JupyterHub.ssl_cert = '/etc/jupyter/conf/server.crt'
c.JupyterHub.port = 9443

c.Authenticator.admin_users = {'jovyan'}

#USER ADDED


def get_sparkmagic_config(hostname, username):

    home_dir = os.path.expanduser(f'~{username}')
    sparkmagic_dir = os.path.join(home_dir, '.sparkmagic')
    
    config = {
        "kernel_python_credentials": {
            "username": "",
            "password": "",
            "url": f"http://{hostname}:8998",
            "auth": "None"
        },
        "kernel_scala_credentials": {
            "username": "",
            "password": "",
            "url": f"http://{hostname}:8998",
            "auth": "None"
        },
        "kernel_r_credentials": {
            "username": "",
            "password": "",
            "url": f"http://{hostname}:8998"
        },
        "logging_config": {
            "version": 1,
            "formatters": {
                "magicsFormatter": {
                    "format": "%(asctime)s\t%(levelname)s\t%(message)s",
                    "datefmt": ""
                }
            },
            "handlers": {
                "magicsHandler": {
                    "class": "hdijupyterutils.filehandler.MagicsFileHandler",
                    "formatter": "magicsFormatter",
                    "home_path": sparkmagic_dir
                }
            },
            "loggers": {
                "magicsLogger": {
                    "handlers": ["magicsHandler"],
                    "level": "DEBUG",
                    "propagate": 0
                }
            }
        },
        "wait_for_idle_timeout_seconds": 1200,
        "livy_session_startup_timeout_seconds": 320,
        "fatal_error_suggestion": "The code failed because of a fatal error:\n\t{}.\n\nSome things to try:\na) Make sure Spark has enough available resources for Jupyter to create a Spark context.\nb) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.\nc) Restart the kernel.",
        "ignore_ssl_errors": "false",
        "session_configs": {
            "driverMemory": "2G",
            "executorCores": 2,
            "executorMemory": "2G",
            "numExecutors": 6,
            "conf": {
                "spark.jars.packages": "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0,org.apache.spark:spark-avro_2.13:3.5.1,org.apache.httpcomponents.client5:httpclient5:5.4.1,com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.41.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
                "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
                "spark.sql.hive.convertMetastoreParquet": "true",
                "spark.hadoop.mapreduce.input.pathFilter.class": "org.apache.hudi.hadoop.HoodieROTablePathFilter",
                "spark.dynamicAllocation.enabled": "false",
                "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
                "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
                "spark.databricks.delta.schema.autoMerge": "true",
                "spark.databricks.delta.retentionDurationCheck.enabled": "false",
                "spark.databricks.delta.merge.repartitionBeforeWrite.enabled": "true",
                "spark.sql.broadcastTimeout": 36000,
                "spark.app.name": f'jupyterhub_session_{username}',
                "spark.sql.legacy.allowNonEmptyLocationInCTAS": "true",
                "spark.sql.legacy.parquet.datetimeRebaseModeInWrite": "CORRECTED",
                "spark.sql.legacy.parquet.datetimeRebaseModeInRead": "LEGACY",
                "spark.sql.legacy.timeParserPolicy": "LEGACY"
            }
        },
        "use_auto_viz": "true",
        "coerce_dataframe": "true",
        "max_results_sql": 2500,
        "pyspark_dataframe_encoding": "utf-8",
        "heartbeat_refresh_seconds": 30,
        "livy_server_heartbeat_timeout_seconds": 0,
        "heartbeat_retry_seconds": 10,
        "server_extension_default_kernel_name": "pysparkkernel",
        "custom_headers": {},
        "retry_policy": "configurable",
        "retry_seconds_to_sleep_list": [
            0.2,
            0.5,
            1.0,
            3.0,
            5.0
        ],
        "configurable_retry_policy_max_retries": 8
    }

    return config

def pre_spawn_hook(spawner):
    import os
    import json

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('jupyterhub-spawner')

    # Log all spawner attributes
    logger.debug('='*50)
    logger.debug('Spawner Debug Information:')
    logger.debug(f'Spawner type: {type(spawner)}')
    logger.debug(f'Spawner attributes: {dir(spawner)}')
    logger.debug(f'User object: {spawner.user}')
    logger.debug(f'User name: {spawner.user.name}')
    logger.debug(f'User server: {spawner.user.server}')
    logger.debug(f'User state: {spawner.user.state}')

    # Log environment variables
    logger.debug('Environment Variables:')
    for key, value in os.environ.items():
        if 'USER' in key or 'HOME' in key:
            logger.debug(f'{key}: {value}')


    # Create sparkmagic config directory if it doesn't exist
    print('$'* 30)
    print('pre_spawn_hook: user:', spawner.user.name)
    sparkmagic_dir = os.path.join('/home', spawner.user.name, '.sparkmagic')
    os.makedirs(sparkmagic_dir, exist_ok=True)

    logger.debug('='*50)
    logger.debug(f'pre_spawn_hook: user: {spawner.user.name}')

    c.Spawner.environment = {
            'SPARKMAGIC_CONF_DIR': sparkmagic_dir, 
            'JUPYTER_ENABLE_LAB': 'yes', 
            'S3_ENDPOINT_URL': s3_endpoint_url, 
            'USER': spawner.user.name, 
            'HOME': f'/home/{spawner.user.name}'
        }

    logger.debug('='*50)
    logger.debug('Environment set:')
    for key, value in c.Spawner.environment.items():
        logger.debug(f'{key}: {value}')

    # Write sparkmagic config
    hostname='emr_url'
    config = get_sparkmagic_config(hostname, spawner.user.name)
    with open(os.path.join(sparkmagic_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)
        os.system(f'chown -R {spawner.user.name}:users {sparkmagic_dir}')

    logger.debug('Config file created and permissions set')
    logger.debug('='*50)

c.Spawner.pre_spawn_hook = pre_spawn_hook

c.Spawner.mem_limit = '512M'
c.Spawner.cpu_limit = 1

We logged some variables in the pre_spawn_hook because we suspected the environment variables were not being set correctly, but they look right:

DEBUG:jupyterhub-spawner:User object: <User(sjoshi_motor 1/1 running)>
DEBUG:jupyterhub-spawner:User name: sjoshi_motor
DEBUG:jupyterhub-spawner:User server: Server(url=http://jupyterhub:51087/user/sjoshi_motor/, bind_url=http://*:51087/user/sjoshi_motor/)
DEBUG:jupyterhub-spawner:User state: {}
DEBUG:jupyterhub-spawner:Environment Variables:
DEBUG:jupyterhub-spawner:HOME: /home/jovyan
DEBUG:jupyterhub-spawner:NB_USER: jovyan
DEBUG:jupyterhub-spawner:==================================================
DEBUG:jupyterhub-spawner:pre_spawn_hook: user: sjoshi_motor
DEBUG:jupyterhub-spawner:==================================================
DEBUG:jupyterhub-spawner:Environment set:
DEBUG:jupyterhub-spawner:SPARKMAGIC_CONF_DIR: /home/sjoshi_motor/.sparkmagic
DEBUG:jupyterhub-spawner:JUPYTER_ENABLE_LAB: yes
DEBUG:jupyterhub-spawner:USER: sjoshi_motor
DEBUG:jupyterhub-spawner:HOME: /home/sjoshi_motor
DEBUG:jupyterhub-spawner:Config file created and permissions set
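
To see what the kernel side actually inherits, a quick check can also be run from a plain Python cell in the affected user's server (just a sketch; the variables of interest are the ones our pre_spawn_hook sets, plus the default path sparkmagic would fall back to):

# Run in a plain Python 3 notebook cell in the affected user's server
# to see which environment the kernel process actually inherits.
import os

# Variables our pre_spawn_hook is supposed to set per user
for key in ("SPARKMAGIC_CONF_DIR", "HOME", "USER"):
    print(key, "=", os.environ.get(key))

# Directory sparkmagic would fall back to if SPARKMAGIC_CONF_DIR is unset
print("fallback:", os.path.expanduser("~/.sparkmagic"))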

Error stack (tailed from inside the container):
docker exec jupyterhub tail -f /var/log/jupyter/jupyter.log

Traceback (most recent call last):
  File "/opt/mamba/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/mamba/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/kernels/pysparkkernel/pysparkkernel.py", line 37, in <module>
    IPKernelApp.launch_instance(kernel_class=PySparkKernel)
  File "/opt/mamba/lib/python3.9/site-packages/traitlets/config/application.py", line 1042, in launch_instance
    app.initialize(argv)
  File "/opt/mamba/lib/python3.9/site-packages/traitlets/config/application.py", line 113, in inner
    return method(app, *args, **kwargs)
  File "/opt/mamba/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 589, in initialize
    self.init_kernel()
  File "/opt/mamba/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 444, in init_kernel
    kernel = kernel_factory(parent=self, session=self.session,
  File "/opt/mamba/lib/python3.9/site-packages/traitlets/config/configurable.py", line 551, in instance
    inst = cls(*args, **kwargs)
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/kernels/pysparkkernel/pysparkkernel.py", line 23, in __init__
    super(PySparkKernel, self).__init__(
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/kernels/wrapperkernel/sparkkernelbase.py", line 99, in __init__
    self.logger = SparkLog("{}_jupyter_kernel".format(self.session_language))
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/utils/sparklogger.py", line 10, in __init__
    super(SparkLog, self).__init__(
  File "/opt/mamba/lib/python3.9/site-packages/hdijupyterutils/log.py", line 14, in __init__
    logging.config.dictConfig(logging_config)
  File "/opt/mamba/lib/python3.9/logging/config.py", line 809, in dictConfig
    dictConfigClass(config).configure()
  File "/opt/mamba/lib/python3.9/logging/config.py", line 571, in configure
    raise ValueError('Unable to configure handler '
ValueError: Unable to configure handler 'magicsHandler'
[I 2025-04-29 06:50:41.810 SingleUserNotebookApp restarter:136] KernelRestarter: restarting kernel (5/5), new random ports
[D 2025-04-29 06:50:41.821 SingleUserNotebookApp manager:438] Starting kernel: ['/opt/mamba/bin/python', '-m', 'sparkmagic.kernels.pysparkkernel.pysparkkernel', '-f', '/home/sjoshi_motor/.local/share/jupyter/runtime/kernel-c72a220c-12c0-4d6e-96f4-fa1db75ae438.json']
[D 2025-04-29 06:50:41.828 SingleUserNotebookApp connect:653] Connecting to: tcp://127.0.0.1:60047
Traceback (most recent call last):
  File "/opt/mamba/lib/python3.9/logging/config.py", line 564, in configure
    handler = self.configure_handler(handlers[name])
  File "/opt/mamba/lib/python3.9/logging/config.py", line 745, in configure_handler
    result = factory(**kwargs)
  File "/opt/mamba/lib/python3.9/site-packages/hdijupyterutils/filehandler.py", line 22, in __init__
    super(MagicsFileHandler, self).__init__(
  File "/opt/mamba/lib/python3.9/logging/__init__.py", line 1146, in __init__
    StreamHandler.__init__(self, self._open())
  File "/opt/mamba/lib/python3.9/logging/__init__.py", line 1175, in _open
    return open(self.baseFilename, self.mode, encoding=self.encoding,
PermissionError: [Errno 13] Permission denied: '/home/yuvraj/.sparkmagic/logs/log_ba3a4407-e64d-496b-ae07-48a25979ffe2.log'
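
If we read sparkmagic's config handling right, the kernel resolves its config directory from SPARKMAGIC_CONF_DIR with a fallback to ~/.sparkmagic, which is why we focused on those environment variables. A rough sketch of that resolution (simplified from our reading of sparkmagic/utils/configuration.py, may not be exact):

# Rough sketch of how sparkmagic picks its config location, based on our
# reading of sparkmagic/utils/configuration.py (simplified, may not be exact).
import os

conf_dir = os.environ.get("SPARKMAGIC_CONF_DIR", "~/.sparkmagic")
conf_file = os.environ.get("SPARKMAGIC_CONF_FILE", "config.json")
config_path = os.path.join(os.path.expanduser(conf_dir), conf_file)
print(config_path)  # in the failing kernel this apparently resolves under /home/yuvraj

So the kernel spawned for sjoshi_motor apparently ends up with yuvraj's conf dir (or HOME), even though the hook logged the right values for sjoshi_motor.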

Sometimes, just restarting the user's server from the hub makes it work, though we don't understand why.
The two of us have been trying to figure this out, but haven't managed so far.

Could anybody help with this? Is there something wrong with the config file, or is there a better approach we should try?