Setting up JupyterHub with the sparkmagic plugin

Hi,
We are trying to set up JupyterHub on EMR.
We've customized the jupyterhub_config.py file so that the sparkmagic plugin works for each user.

Scenario:

User sjoshi_motor tries to start a PySpark session in a JupyterHub notebook,
but the kernel tries to access /home/yuvraj/.sparkmagic (the folder of another user who is already logged in) and fails with a permission error (which it should!).

JupyterHub runs on the master node of the cluster,
inside the Docker container that EMR itself provides.

#jupyterhub_config.py
# Configuration file for jupyterhub.

import os
import logging

notebook_dir = os.environ.get('DOCKER_NOTEBOOK_DIR')
network_name = 'jupyterhub-network'
s3_endpoint_url = os.environ.get('S3_ENDPOINT_URL', 's3.amazonaws.com')

c.Spawner.debug = True
#c.Spawner.environment = {'SPARKMAGIC_CONF_DIR':'/etc/jupyter/conf', 'JUPYTER_ENABLE_LAB': 'yes', 'S3_ENDPOINT_URL': s3_endpoint_url}

c.JupyterHub.hub_ip = '0.0.0.0'
c.JupyterHub.admin_access = True
c.JupyterHub.ssl_key = '/etc/jupyter/conf/server.key'
c.JupyterHub.ssl_cert = '/etc/jupyter/conf/server.crt'
c.JupyterHub.port = 9443

c.Authenticator.admin_users = {'jovyan'}

#USER ADDED


def get_sparkmagic_config(hostname, username):

    home_dir = os.path.expanduser(f'~{username}')
    sparkmagic_dir = os.path.join(home_dir, '.sparkmagic')
    
    config = {
        "kernel_python_credentials": {
            "username": "",
            "password": "",
            "url": f"http://{hostname}:8998",
            "auth": "None"
        },
        "kernel_scala_credentials": {
            "username": "",
            "password": "",
            "url": f"http://{hostname}:8998",
            "auth": "None"
        },
        "kernel_r_credentials": {
            "username": "",
            "password": "",
            "url": f"http://{hostname}:8998"
        },
        "logging_config": {
            "version": 1,
            "formatters": {
                "magicsFormatter": {
                    "format": "%(asctime)s\t%(levelname)s\t%(message)s",
                    "datefmt": ""
                }
            },
            "handlers": {
                "magicsHandler": {
                    "class": "hdijupyterutils.filehandler.MagicsFileHandler",
                    "formatter": "magicsFormatter",
                    "home_path": sparkmagic_dir
                }
            },
            "loggers": {
                "magicsLogger": {
                    "handlers": ["magicsHandler"],
                    "level": "DEBUG",
                    "propagate": 0
                }
            }
        },
        "wait_for_idle_timeout_seconds": 1200,
        "livy_session_startup_timeout_seconds": 320,
        "fatal_error_suggestion": "The code failed because of a fatal error:\n\t{}.\n\nSome things to try:\na) Make sure Spark has enough available resources for Jupyter to create a Spark context.\nb) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.\nc) Restart the kernel.",
        "ignore_ssl_errors": "false",
        "session_configs": {
            "driverMemory": "2G",
            "executorCores": 2,
            "executorMemory": "2G",
            "numExecutors": 6,
            "conf": {
                "spark.jars.packages": "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0,org.apache.spark:spark-avro_2.13:3.5.1,org.apache.httpcomponents.client5:httpclient5:5.4.1,com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.41.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
                "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
                "spark.sql.hive.convertMetastoreParquet": "true",
                "spark.hadoop.mapreduce.input.pathFilter.class": "org.apache.hudi.hadoop.HoodieROTablePathFilter",
                "spark.dynamicAllocation.enabled": "false",
                "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
                "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
                "spark.databricks.delta.schema.autoMerge": "true",
                "spark.databricks.delta.retentionDurationCheck.enabled": "false",
                "spark.databricks.delta.merge.repartitionBeforeWrite.enabled": "true",
                "spark.sql.broadcastTimeout": 36000,
                "spark.app.name": f'jupyterhub_session_{username}',
                "spark.sql.legacy.allowNonEmptyLocationInCTAS": "true",
                "spark.sql.legacy.parquet.datetimeRebaseModeInWrite": "CORRECTED",
                "spark.sql.legacy.parquet.datetimeRebaseModeInRead": "LEGACY",
                "spark.sql.legacy.timeParserPolicy": "LEGACY"
            }
        },
        "use_auto_viz": "true",
        "coerce_dataframe": "true",
        "max_results_sql": 2500,
        "pyspark_dataframe_encoding": "utf-8",
        "heartbeat_refresh_seconds": 30,
        "livy_server_heartbeat_timeout_seconds": 0,
        "heartbeat_retry_seconds": 10,
        "server_extension_default_kernel_name": "pysparkkernel",
        "custom_headers": {},
        "retry_policy": "configurable",
        "retry_seconds_to_sleep_list": [
            0.2,
            0.5,
            1.0,
            3.0,
            5.0
        ],
        "configurable_retry_policy_max_retries": 8
    }

    return config

def pre_spawn_hook(spawner):
    import os
    import json

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('jupyterhub-spawner')

    # Log all spawner attributes
    logger.debug('='*50)
    logger.debug('Spawner Debug Information:')
    logger.debug(f'Spawner type: {type(spawner)}')
    logger.debug(f'Spawner attributes: {dir(spawner)}')
    logger.debug(f'User object: {spawner.user}')
    logger.debug(f'User name: {spawner.user.name}')
    logger.debug(f'User server: {spawner.user.server}')
    logger.debug(f'User state: {spawner.user.state}')

    # Log environment variables
    logger.debug('Environment Variables:')
    for key, value in os.environ.items():
        if 'USER' in key or 'HOME' in key:
            logger.debug(f'{key}: {value}')


    # Create sparkmagic config directory if it doesn't exist
    print('$'* 30)
    print('pre_spawn_hook: user:', spawner.user.name)
    sparkmagic_dir = os.path.join('/home', spawner.user.name, '.sparkmagic')
    os.makedirs(sparkmagic_dir, exist_ok=True)

    logger.debug('='*50)
    logger.debug(f'pre_spawn_hook: user: {spawner.user.name}')

    c.Spawner.environment = {
            'SPARKMAGIC_CONF_DIR': sparkmagic_dir, 
            'JUPYTER_ENABLE_LAB': 'yes', 
            'S3_ENDPOINT_URL': s3_endpoint_url, 
            'USER': spawner.user.name, 
            'HOME': f'/home/{spawner.user.name}'
        }

    logger.debug('='*50)
    logger.debug('Environment set:')
    for key, value in c.Spawner.environment.items():
        logger.debug(f'{key}: {value}')

    # Write sparkmagic config
    hostname='emr_url'
    config = get_sparkmagic_config(hostname, spawner.user.name)
    with open(os.path.join(sparkmagic_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)
        os.system(f'chown -R {spawner.user.name}:users {sparkmagic_dir}')

    logger.debug('Config file created and permissions set')
    logger.debug('='*50)

c.Spawner.pre_spawn_hook = pre_spawn_hook

c.Spawner.mem_limit = '512M'
c.Spawner.cpu_limit = 1

We logged some variables in the pre_spawn_hook because we suspected the environment variables were not being set correctly, but they look right:

DEBUG:jupyterhub-spawner:User object: <User(sjoshi_motor 1/1 running)>
DEBUG:jupyterhub-spawner:User name: sjoshi_motor
DEBUG:jupyterhub-spawner:User server: Server(url=http://jupyterhub:51087/user/sjoshi_motor/, bind_url=http://*:51087/user/sjoshi_motor/)
DEBUG:jupyterhub-spawner:User state: {}
DEBUG:jupyterhub-spawner:Environment Variables:
DEBUG:jupyterhub-spawner:HOME: /home/jovyan
DEBUG:jupyterhub-spawner:NB_USER: jovyan
DEBUG:jupyterhub-spawner:==================================================
DEBUG:jupyterhub-spawner:pre_spawn_hook: user: sjoshi_motor
DEBUG:jupyterhub-spawner:==================================================
DEBUG:jupyterhub-spawner:Environment set:
DEBUG:jupyterhub-spawner:SPARKMAGIC_CONF_DIR: /home/sjoshi_motor/.sparkmagic
DEBUG:jupyterhub-spawner:JUPYTER_ENABLE_LAB: yes
DEBUG:jupyterhub-spawner:USER: sjoshi_motor
DEBUG:jupyterhub-spawner:HOME: /home/sjoshi_motor
DEBUG:jupyterhub-spawner:Config file created and permissions set
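
To see what the kernel side actually inherits, a quick check can also be run from a plain Python cell in the affected user's server (just a sketch; the variables of interest are the ones our pre_spawn_hook sets, plus the default path sparkmagic would fall back to):

# Run in a plain Python 3 notebook cell in the affected user's server
# to see which environment the kernel process actually inherits.
import os

# Variables our pre_spawn_hook is supposed to set per user
for key in ("SPARKMAGIC_CONF_DIR", "HOME", "USER"):
    print(key, "=", os.environ.get(key))

# Directory sparkmagic would fall back to if SPARKMAGIC_CONF_DIR is unset
print("fallback:", os.path.expanduser("~/.sparkmagic"))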

Error stack (tailed from inside the container):
docker exec jupyterhub tail -f /var/log/jupyter/jupyter.log

Traceback (most recent call last):
  File "/opt/mamba/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/mamba/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/kernels/pysparkkernel/pysparkkernel.py", line 37, in <module>
    IPKernelApp.launch_instance(kernel_class=PySparkKernel)
  File "/opt/mamba/lib/python3.9/site-packages/traitlets/config/application.py", line 1042, in launch_instance
    app.initialize(argv)
  File "/opt/mamba/lib/python3.9/site-packages/traitlets/config/application.py", line 113, in inner
    return method(app, *args, **kwargs)
  File "/opt/mamba/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 589, in initialize
    self.init_kernel()
  File "/opt/mamba/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 444, in init_kernel
    kernel = kernel_factory(parent=self, session=self.session,
  File "/opt/mamba/lib/python3.9/site-packages/traitlets/config/configurable.py", line 551, in instance
    inst = cls(*args, **kwargs)
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/kernels/pysparkkernel/pysparkkernel.py", line 23, in __init__
    super(PySparkKernel, self).__init__(
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/kernels/wrapperkernel/sparkkernelbase.py", line 99, in __init__
    self.logger = SparkLog("{}_jupyter_kernel".format(self.session_language))
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/utils/sparklogger.py", line 10, in __init__
    super(SparkLog, self).__init__(
  File "/opt/mamba/lib/python3.9/site-packages/hdijupyterutils/log.py", line 14, in __init__
    logging.config.dictConfig(logging_config)
  File "/opt/mamba/lib/python3.9/logging/config.py", line 809, in dictConfig
    dictConfigClass(config).configure()
  File "/opt/mamba/lib/python3.9/logging/config.py", line 571, in configure
    raise ValueError('Unable to configure handler '
ValueError: Unable to configure handler 'magicsHandler'
[I 2025-04-29 06:50:41.810 SingleUserNotebookApp restarter:136] KernelRestarter: restarting kernel (5/5), new random ports
[D 2025-04-29 06:50:41.821 SingleUserNotebookApp manager:438] Starting kernel: ['/opt/mamba/bin/python', '-m', 'sparkmagic.kernels.pysparkkernel.pysparkkernel', '-f', '/home/sjoshi_motor/.local/share/jupyter/runtime/kernel-c72a220c-12c0-4d6e-96f4-fa1db75ae438.json']
[D 2025-04-29 06:50:41.828 SingleUserNotebookApp connect:653] Connecting to: tcp://127.0.0.1:60047
Traceback (most recent call last):
  File "/opt/mamba/lib/python3.9/logging/config.py", line 564, in configure
    handler = self.configure_handler(handlers[name])
  File "/opt/mamba/lib/python3.9/logging/config.py", line 745, in configure_handler
    result = factory(**kwargs)
  File "/opt/mamba/lib/python3.9/site-packages/hdijupyterutils/filehandler.py", line 22, in __init__
    super(MagicsFileHandler, self).__init__(
  File "/opt/mamba/lib/python3.9/logging/__init__.py", line 1146, in __init__
    StreamHandler.__init__(self, self._open())
  File "/opt/mamba/lib/python3.9/logging/__init__.py", line 1175, in _open
    return open(self.baseFilename, self.mode, encoding=self.encoding,
PermissionError: [Errno 13] Permission denied: '/home/yuvraj/.sparkmagic/logs/log_ba3a4407-e64d-496b-ae07-48a25979ffe2.log'
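
If we read sparkmagic's config handling right, the kernel resolves its config directory from SPARKMAGIC_CONF_DIR with a fallback to ~/.sparkmagic, which is why we focused on those environment variables. A rough sketch of that resolution (simplified from our reading of sparkmagic/utils/configuration.py, may not be exact):

# Rough sketch of how sparkmagic picks its config location, based on our
# reading of sparkmagic/utils/configuration.py (simplified, may not be exact).
import os

conf_dir = os.environ.get("SPARKMAGIC_CONF_DIR", "~/.sparkmagic")
conf_file = os.environ.get("SPARKMAGIC_CONF_FILE", "config.json")
config_path = os.path.join(os.path.expanduser(conf_dir), conf_file)
print(config_path)  # in the failing kernel this apparently resolves under /home/yuvraj

So the kernel spawned for sjoshi_motor apparently ends up with yuvraj's conf dir (or HOME), even though the hook logged the right values for sjoshi_motor.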

Sometimes, just restarting the user's server from the hub makes it work, though we don't understand why.
The two of us have been trying to figure this out, but haven't managed so far.

Could anybody help with this? Is there something wrong with the config file, or is there a better approach we should try?