Hi,
We are trying to set up JupyterHub on EMR.
We've customized the jupyterhub_config.py file so that the sparkmagic plugin works for each user.
Scenario:
User sjoshi_motor tries to start a PySpark session in a JupyterHub notebook,
but the kernel tries to access /home/yuvraj/.sparkmagic (the home folder of another user who is already logged in) and fails with a permission error (which it should!).
JupyterHub is running on the master node of the cluster,
inside a Docker container provided by EMR itself.
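As a quick sanity check of what the affected user's kernel actually inherits, a few lines like these can be run in a plain (non-Spark) notebook cell; this is just illustrative and only reads environment variables:

# Quick check from a plain Python notebook cell: which config dir,
# home, and user does this kernel actually see?
import os
for key in ('SPARKMAGIC_CONF_DIR', 'HOME', 'USER'):
    print(key, '=', os.environ.get(key))

Our full jupyterhub_config.py follows.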
#jupyterhub_config.py
# Configuration file for jupyterhub.
import os
import logging
notebook_dir = os.environ.get('DOCKER_NOTEBOOK_DIR')
network_name = 'jupyterhub-network'
s3_endpoint_url = os.environ.get('S3_ENDPOINT_URL', 's3.amazonaws.com')
c.Spawner.debug = True
#c.Spawner.environment = {'SPARKMAGIC_CONF_DIR':'/etc/jupyter/conf', 'JUPYTER_ENABLE_LAB': 'yes', 'S3_ENDPOINT_URL': s3_endpoint_url}
c.JupyterHub.hub_ip = '0.0.0.0'
c.JupyterHub.admin_access = True
c.JupyterHub.ssl_key = '/etc/jupyter/conf/server.key'
c.JupyterHub.ssl_cert = '/etc/jupyter/conf/server.crt'
c.JupyterHub.port = 9443
c.Authenticator.admin_users = {'jovyan'}
# USER ADDED
def get_sparkmagic_config(hostname, username):
    home_dir = os.path.expanduser(f'~{username}')
    sparkmagic_dir = os.path.join(home_dir, '.sparkmagic')
    config = {
        "kernel_python_credentials": {
            "username": "",
            "password": "",
            "url": f"http://{hostname}:8998",
            "auth": "None"
        },
        "kernel_scala_credentials": {
            "username": "",
            "password": "",
            "url": f"http://{hostname}:8998",
            "auth": "None"
        },
        "kernel_r_credentials": {
            "username": "",
            "password": "",
            "url": f"http://{hostname}:8998"
        },
        "logging_config": {
            "version": 1,
            "formatters": {
                "magicsFormatter": {
                    "format": "%(asctime)s\t%(levelname)s\t%(message)s",
                    "datefmt": ""
                }
            },
            "handlers": {
                "magicsHandler": {
                    "class": "hdijupyterutils.filehandler.MagicsFileHandler",
                    "formatter": "magicsFormatter",
                    "home_path": sparkmagic_dir
                }
            },
            "loggers": {
                "magicsLogger": {
                    "handlers": ["magicsHandler"],
                    "level": "DEBUG",
                    "propagate": 0
                }
            }
        },
        "wait_for_idle_timeout_seconds": 1200,
        "livy_session_startup_timeout_seconds": 320,
        "fatal_error_suggestion": "The code failed because of a fatal error:\n\t{}.\n\nSome things to try:\na) Make sure Spark has enough available resources for Jupyter to create a Spark context.\nb) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.\nc) Restart the kernel.",
        "ignore_ssl_errors": "false",
        "session_configs": {
            "driverMemory": "2G",
            "executorCores": 2,
            "executorMemory": "2G",
            "numExecutors": 6,
            "conf": {
                "spark.jars.packages": "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0,org.apache.spark:spark-avro_2.13:3.5.1,org.apache.httpcomponents.client5:httpclient5:5.4.1,com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.41.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
                "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
                "spark.sql.hive.convertMetastoreParquet": "true",
                "spark.hadoop.mapreduce.input.pathFilter.class": "org.apache.hudi.hadoop.HoodieROTablePathFilter",
                "spark.dynamicAllocation.enabled": "false",
                "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
                "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
                "spark.databricks.delta.schema.autoMerge": "true",
                "spark.databricks.delta.retentionDurationCheck.enabled": "false",
                "spark.databricks.delta.merge.repartitionBeforeWrite.enabled": "true",
                "spark.sql.broadcastTimeout": 36000,
                "spark.app.name": f'jupyterhub_session_{username}',
                "spark.sql.legacy.allowNonEmptyLocationInCTAS": "true",
                "spark.sql.legacy.parquet.datetimeRebaseModeInWrite": "CORRECTED",
                "spark.sql.legacy.parquet.datetimeRebaseModeInRead": "LEGACY",
                "spark.sql.legacy.timeParserPolicy": "LEGACY"
            }
        },
        "use_auto_viz": "true",
        "coerce_dataframe": "true",
        "max_results_sql": 2500,
        "pyspark_dataframe_encoding": "utf-8",
        "heartbeat_refresh_seconds": 30,
        "livy_server_heartbeat_timeout_seconds": 0,
        "heartbeat_retry_seconds": 10,
        "server_extension_default_kernel_name": "pysparkkernel",
        "custom_headers": {},
        "retry_policy": "configurable",
        "retry_seconds_to_sleep_list": [0.2, 0.5, 1.0, 3.0, 5.0],
        "configurable_retry_policy_max_retries": 8
    }
    return config
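One detail worth flagging in the function above: os.path.expanduser(f'~{username}') only resolves if the user exists in the container's /etc/passwd; for an unknown user it silently returns the literal string '~username' instead of raising. A small guard like the sketch below (safe_home_dir is our own illustrative helper, not part of the config) would fail fast instead:

# Sketch: fail fast if the target user is unknown inside the container,
# since os.path.expanduser('~unknown') silently returns '~unknown'.
import pwd

def safe_home_dir(username):
    try:
        return pwd.getpwnam(username).pw_dir  # e.g. '/home/sjoshi_motor'
    except KeyError:
        raise RuntimeError(f'user {username!r} does not exist in the container')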
def pre_spawn_hook(spawner):
    import json

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('jupyterhub-spawner')

    # Log all spawner attributes
    logger.debug('=' * 50)
    logger.debug('Spawner Debug Information:')
    logger.debug(f'Spawner type: {type(spawner)}')
    logger.debug(f'Spawner attributes: {dir(spawner)}')
    logger.debug(f'User object: {spawner.user}')
    logger.debug(f'User name: {spawner.user.name}')
    logger.debug(f'User server: {spawner.user.server}')
    logger.debug(f'User state: {spawner.user.state}')

    # Log environment variables
    logger.debug('Environment Variables:')
    for key, value in os.environ.items():
        if 'USER' in key or 'HOME' in key:
            logger.debug(f'{key}: {value}')

    # Create the sparkmagic config directory if it doesn't exist
    print('$' * 30)
    print('pre_spawn_hook: user:', spawner.user.name)
    sparkmagic_dir = os.path.join('/home', spawner.user.name, '.sparkmagic')
    os.makedirs(sparkmagic_dir, exist_ok=True)
    logger.debug('=' * 50)
    logger.debug(f'pre_spawn_hook: user: {spawner.user.name}')

    c.Spawner.environment = {
        'SPARKMAGIC_CONF_DIR': sparkmagic_dir,
        'JUPYTER_ENABLE_LAB': 'yes',
        'S3_ENDPOINT_URL': s3_endpoint_url,
        'USER': spawner.user.name,
        'HOME': f'/home/{spawner.user.name}'
    }
    logger.debug('=' * 50)
    logger.debug('Environment set:')
    for key, value in c.Spawner.environment.items():
        logger.debug(f'{key}: {value}')

    # Write the sparkmagic config
    hostname = 'emr_url'
    config = get_sparkmagic_config(hostname, spawner.user.name)
    with open(os.path.join(sparkmagic_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)
    os.system(f'chown -R {spawner.user.name}:users {sparkmagic_dir}')
    logger.debug('Config file created and permissions set')
    logger.debug('=' * 50)
c.Spawner.pre_spawn_hook = pre_spawn_hook
c.Spawner.mem_limit = '512M'
c.Spawner.cpu_limit = 1
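One thing we are unsure about: the hook assigns to c.Spawner.environment, which is the class-wide config object, so if two users spawn close together, the later assignment could overwrite the earlier one before the first server actually launches. A minimal per-spawner alternative we are considering (same variables as above, untested) would update the environment on the spawner instance instead:

# Sketch: set the environment on this spawner instance only, instead of
# mutating the class-wide c.Spawner.environment from inside the hook.
def pre_spawn_hook(spawner):
    username = spawner.user.name
    sparkmagic_dir = f'/home/{username}/.sparkmagic'
    spawner.environment.update({
        'SPARKMAGIC_CONF_DIR': sparkmagic_dir,
        'JUPYTER_ENABLE_LAB': 'yes',
        'S3_ENDPOINT_URL': s3_endpoint_url,
        'USER': username,
        'HOME': f'/home/{username}',
    })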
We logged some variables because we thought the environment variables were not being set correctly, but they look right:
DEBUG:jupyterhub-spawner:User object: <User(sjoshi_motor 1/1 running)>
DEBUG:jupyterhub-spawner:User name: sjoshi_motor
DEBUG:jupyterhub-spawner:User server: Server(url=http://jupyterhub:51087/user/sjoshi_motor/, bind_url=http://*:51087/user/sjoshi_motor/)
DEBUG:jupyterhub-spawner:User state: {}
DEBUG:jupyterhub-spawner:Environment Variables:
DEBUG:jupyterhub-spawner:HOME: /home/jovyan
DEBUG:jupyterhub-spawner:NB_USER: jovyan
DEBUG:jupyterhub-spawner:==================================================
DEBUG:jupyterhub-spawner:pre_spawn_hook: user: sjoshi_motor
DEBUG:jupyterhub-spawner:==================================================
DEBUG:jupyterhub-spawner:Environment set:
DEBUG:jupyterhub-spawner:SPARKMAGIC_CONF_DIR: /home/sjoshi_motor/.sparkmagic
DEBUG:jupyterhub-spawner:JUPYTER_ENABLE_LAB: yes
DEBUG:jupyterhub-spawner:USER: sjoshi_motor
DEBUG:jupyterhub-spawner:HOME: /home/sjoshi_motor
DEBUG:jupyterhub-spawner:Config file created and permissions set
Error stack (tailing the Jupyter log inside the container):
docker exec jupyterhub tail -f /var/log/jupyter/jupyter.log
Traceback (most recent call last):
  File "/opt/mamba/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/mamba/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/kernels/pysparkkernel/pysparkkernel.py", line 37, in <module>
    IPKernelApp.launch_instance(kernel_class=PySparkKernel)
  File "/opt/mamba/lib/python3.9/site-packages/traitlets/config/application.py", line 1042, in launch_instance
    app.initialize(argv)
  File "/opt/mamba/lib/python3.9/site-packages/traitlets/config/application.py", line 113, in inner
    return method(app, *args, **kwargs)
  File "/opt/mamba/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 589, in initialize
    self.init_kernel()
  File "/opt/mamba/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 444, in init_kernel
    kernel = kernel_factory(parent=self, session=self.session,
  File "/opt/mamba/lib/python3.9/site-packages/traitlets/config/configurable.py", line 551, in instance
    inst = cls(*args, **kwargs)
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/kernels/pysparkkernel/pysparkkernel.py", line 23, in __init__
    super(PySparkKernel, self).__init__(
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/kernels/wrapperkernel/sparkkernelbase.py", line 99, in __init__
    self.logger = SparkLog("{}_jupyter_kernel".format(self.session_language))
  File "/opt/mamba/lib/python3.9/site-packages/sparkmagic/utils/sparklogger.py", line 10, in __init__
    super(SparkLog, self).__init__(
  File "/opt/mamba/lib/python3.9/site-packages/hdijupyterutils/log.py", line 14, in __init__
    logging.config.dictConfig(logging_config)
  File "/opt/mamba/lib/python3.9/logging/config.py", line 809, in dictConfig
    dictConfigClass(config).configure()
  File "/opt/mamba/lib/python3.9/logging/config.py", line 571, in configure
    raise ValueError('Unable to configure handler '
ValueError: Unable to configure handler 'magicsHandler'
[I 2025-04-29 06:50:41.810 SingleUserNotebookApp restarter:136] KernelRestarter: restarting kernel (5/5), new random ports
[D 2025-04-29 06:50:41.821 SingleUserNotebookApp manager:438] Starting kernel: ['/opt/mamba/bin/python', '-m', 'sparkmagic.kernels.pysparkkernel.pysparkkernel', '-f', '/home/sjoshi_motor/.local/share/jupyter/runtime/kernel-c72a220c-12c0-4d6e-96f4-fa1db75ae438.json']
[D 2025-04-29 06:50:41.828 SingleUserNotebookApp connect:653] Connecting to: tcp://127.0.0.1:60047
Traceback (most recent call last):
  File "/opt/mamba/lib/python3.9/logging/config.py", line 564, in configure
    handler = self.configure_handler(handlers[name])
  File "/opt/mamba/lib/python3.9/logging/config.py", line 745, in configure_handler
    result = factory(**kwargs)
  File "/opt/mamba/lib/python3.9/site-packages/hdijupyterutils/filehandler.py", line 22, in __init__
    super(MagicsFileHandler, self).__init__(
  File "/opt/mamba/lib/python3.9/logging/__init__.py", line 1146, in __init__
    StreamHandler.__init__(self, self._open())
  File "/opt/mamba/lib/python3.9/logging/__init__.py", line 1175, in _open
    return open(self.baseFilename, self.mode, encoding=self.encoding,
PermissionError: [Errno 13] Permission denied: '/home/yuvraj/.sparkmagic/logs/log_ba3a4407-e64d-496b-ae07-48a25979ffe2.log'
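For what it's worth, the failing handler is just a file handler rooted at home_path, so the error can be reproduced outside the kernel with a few lines (assuming hdijupyterutils is importable; the path is the one from the traceback):

# Minimal reproduction of the failing handler setup: configuring the
# handler opens a log file under <home_path>/logs, which is where the
# PermissionError comes from when home_path belongs to another user.
import logging.config

logging.config.dictConfig({
    'version': 1,
    'handlers': {
        'magicsHandler': {
            'class': 'hdijupyterutils.filehandler.MagicsFileHandler',
            'home_path': '/home/yuvraj/.sparkmagic',
        }
    },
})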
Sometimes, when we simply restart the user's server from the hub, it works, which makes us suspect some ordering or race issue between spawns.
The two of us have been trying to figure this out, but no luck so far.
Could anybody help with this? Is there something wrong with the config file, or is there a better approach we should try?