From 05330154d0737226e8aa928921ebabf4b4505849 Mon Sep 17 00:00:00 2001
From: Vidar Tonaas Fauske
Date: Tue, 13 Nov 2018 13:45:14 +0100
Subject: [PATCH 1/3] Handle missing expected errors better

- Better error message for a missing error in output comparison.
- Fail the cell if an unrun cell with an expected error does not produce one.
---
 nbval/plugin.py                   | 43 +++++++++++++++++++++++++++++++-------------
 tests/test_expected_exceptions.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_expected_exceptions.py

diff --git a/nbval/plugin.py b/nbval/plugin.py
index 4200ec8..0579c7a 100644
--- a/nbval/plugin.py
+++ b/nbval/plugin.py
@@ -428,12 +428,22 @@ def compare_outputs(self, test, ref, skip_compare=None):
         test_keys = set(testing_outs.keys())
 
         if ref_keys - test_keys:
-            self.comparison_traceback.append(
-                cc.FAIL
-                + "Missing output fields from running code: %s"
-                % (ref_keys - test_keys)
-                + cc.ENDC
-            )
+            if ref_keys == {'evalue', 'ename'}:
+                self.comparison_traceback.append(
+                    cc.FAIL
+                    + "Expected error:\n %s: %r" % (
+                        '\n'.join(reference_outs['ename']),
+                        '\n'.join(reference_outs['evalue'])
+                    )
+                    + cc.ENDC
+                )
+            else:
+                self.comparison_traceback.append(
+                    cc.FAIL
+                    + "Missing output fields from running code: %s"
+                    % (ref_keys - test_keys)
+                    + cc.ENDC
+                )
             return False
         elif test_keys - ref_keys:
             self.comparison_traceback.append(
@@ -582,6 +592,13 @@ def runtest(self):
         # TODO: Only store if comparing with nbdime, to save on memory usage
         self.test_outputs = outs
 
+        # Cells whose reference was not run will not have their outputs checked:
+        unrun = self.cell.execution_count is None
+        if unrun and self.cell.outputs:
+            self.raise_cell_error('Unrun reference cell has outputs')
+
+        cell_has_error = False
+
         # Now get the outputs from the iopub channel
         while True:
             # The iopub channel broadcasts a range of messages. We keep reading
@@ -692,6 +709,7 @@
             # cell execution. Therefore raise a cell error and pass the
             # traceback information.
             elif msg_type == 'error':
+                cell_has_error = True
                 # Store error in output first
                 out['ename'] = reply['ename']
                 out['evalue'] = reply['evalue']
@@ -700,9 +718,9 @@
                 if not self.options['check_exception']:
                     # Ensure we flush iopub before raising error
                     try:
-                        self.parent.kernel.await_idle(msg_id, self.output_timeout)
+                        kernel.await_idle(msg_id, self.output_timeout)
                     except Empty:
-                        self.stop()
+                        kernel.stop()
                         raise RuntimeError('Timed out waiting for idle kernel!')
                     traceback = '\n' + '\n'.join(reply['traceback'])
                     if out['ename'] == 'KeyboardInterrupt' and self.parent.timed_out:
@@ -718,10 +736,11 @@ def runtest(self):
 
         outs[:] = coalesce_streams(outs)
 
-        # Cells where the reference is not run, will not check outputs:
-        unrun = self.cell.execution_count is None
-        if unrun and self.cell.outputs:
-            self.raise_cell_error('Unrun reference cell has outputs')
+        if self.options['check_exception'] and unrun and not cell_has_error:
+            # If unrun, we cannot rely on output comparison for checking errors
+            self.raise_cell_error(
+                "Expected error",
+                "Expected cell to produce an error, but none was produced!")
 
         # Compare if the outputs have the same number of lines
         # and throw an error if it fails
diff --git a/tests/test_expected_exceptions.py b/tests/test_expected_exceptions.py
new file mode 100644
index 0000000..c281e4a
--- /dev/null
+++ b/tests/test_expected_exceptions.py
@@ -0,0 +1,43 @@
+import os
+
+import nbformat
+import pytest
+
+from utils import build_nb
+
+
+pytest_plugins = "pytester"
+
+
+def test_unrun_raises(testdir):
+    # This test uses the testdir fixture from pytester, which is useful for
+    # testing pytest plugins. It writes a notebook to a temporary dir
+    # and then runs pytest.
+
+    # Setup notebook to test:
+    sources = [
+        # In [1]:
+        "raise ValueError('foo')",
+    ]
+    # Build unrun notebook:
+    nb = build_nb(sources, mark_run=False)
+
+    # Write notebook to test dir
+    nbformat.write(nb, os.path.join(
+        str(testdir.tmpdir), 'test_expected_exceptions.ipynb'))
+
+    # Run tests
+    result = testdir.inline_run('--nbval', '--current-env', '-s')
+    reports = result.getreports('pytest_runtest_logreport')
+
+    # Setup and teardown of cells should have no issues:
+    setup_teardown = [r for r in reports if r.when != 'call']
+    for r in setup_teardown:
+        assert r.passed
+
+    reports = [r for r in reports if r.when == 'call']
+
+    assert len(reports) == 1
+
+    # The cell should fail, and not as an expected failure (xfail)
+    assert reports[0].failed and not hasattr(reports[0], 'wasxfail')

From a124c502999c44e3460a8ffe14fa23e94b997f61 Mon Sep 17 00:00:00 2001
From: Vidar Tonaas Fauske
Date: Tue, 13 Nov 2018 14:44:26 +0100
Subject: [PATCH 2/3] Fix/expand tests for expected failures

---
 tests/test_expected_exceptions.py | 72 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 59 insertions(+), 13 deletions(-)

diff --git a/tests/test_expected_exceptions.py b/tests/test_expected_exceptions.py
index c281e4a..d949d3b 100644
--- a/tests/test_expected_exceptions.py
+++ b/tests/test_expected_exceptions.py
@@ -9,7 +9,7 @@
 pytest_plugins = "pytester"
 
 
-def test_unrun_raises(testdir):
+def test_run_raises(testdir):
     # This test uses the testdir fixture from pytester, which is useful for
     # testing pytest plugins. It writes a notebook to a temporary dir
     # and then runs pytest.
@@ -17,27 +17,73 @@ def test_unrun_raises(testdir):
     # Setup notebook to test:
     sources = [
         # In [1]:
-        "raise ValueError('foo')",
+        "",  # No error produced, when one is expected
+        # In [2]:
+        "raise ValueError('foo')",  # Wrong ename
+        # In [3]:
+        "raise ValueError('foo')",  # Wrong evalue
     ]
     # Build unrun notebook:
-    nb = build_nb(sources, mark_run=False)
+    nb = build_nb(sources, mark_run=True)
+
+    nb.cells[0].metadata.tags = ['raises-exception']
+    nb.cells[0].outputs.append(
+        nbformat.v4.new_output(
+            'error',
+            ename='ValueError',
+            evalue='foo',
+            traceback=['foobar', 'bob'],  # Should be ignored
+        )
+    )
+
+    nb.cells[1].metadata.tags = ['raises-exception']
+    nb.cells[1].outputs.append(
+        nbformat.v4.new_output(
+            'error',
+            ename='TypeError',  # Expected TypeError, got ValueError
+            evalue='foo',
+            traceback=['foobar', 'bob'],  # Should be ignored
+        )
+    )
+
+    nb.cells[2].metadata.tags = ['raises-exception']
+    nb.cells[2].outputs.append(
+        nbformat.v4.new_output(
+            'error',
+            ename='ValueError',
+            evalue='bar',  # Expected bar, got foo
+            traceback=['foobar', 'bob'],  # Should be ignored
+        )
+    )
 
     # Write notebook to test dir
     nbformat.write(nb, os.path.join(
         str(testdir.tmpdir), 'test_expected_exceptions.ipynb'))
 
     # Run tests
-    result = testdir.inline_run('--nbval', '--current-env', '-s')
-    reports = result.getreports('pytest_runtest_logreport')
+    result = testdir.runpytest_subprocess('--nbval', '--current-env', '-s')
+    result.assert_outcomes(failed=3)
 
-    # Setup and teardown of cells should have no issues:
-    setup_teardown = [r for r in reports if r.when != 'call']
-    for r in setup_teardown:
-        assert r.passed
 
-    reports = [r for r in reports if r.when == 'call']
 
-    assert len(reports) == 1
+def test_unrun_raises(testdir):
+    # This test uses the testdir fixture from pytester, which is useful for
+    # testing pytest plugins. It writes a notebook to a temporary dir
+    # and then runs pytest.
 
-    # The cell should fail, and not as an expected failure (xfail)
-    assert reports[0].failed and not hasattr(reports[0], 'wasxfail')
+    # Setup notebook to test:
+    sources = [
+        # In [1]:
+        "pass",
+    ]
+    # Build unrun notebook:
+    nb = build_nb(sources, mark_run=False)
+    nb.cells[0].metadata.tags = ['raises-exception']
+
+    # Write notebook to test dir
+    nbformat.write(nb, os.path.join(
+        str(testdir.tmpdir), 'test_expected_exceptions.ipynb'))
+
+    # Run tests
+    result = testdir.runpytest_subprocess('--nbval', '--current-env', '-s')
+    result.assert_outcomes(failed=1)

From cdff477c9ba4c47ebcc4142df5800dcdefe77bdc Mon Sep 17 00:00:00 2001
From: Vidar Tonaas Fauske
Date: Tue, 13 Nov 2018 14:58:56 +0100
Subject: [PATCH 3/3] Cleanup

---
 nbval/plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nbval/plugin.py b/nbval/plugin.py
index 0579c7a..104e0ca 100644
--- a/nbval/plugin.py
+++ b/nbval/plugin.py
@@ -580,7 +580,7 @@ def runtest(self):
 
         # Poll the shell channel to get a message
         try:
-            self.parent.kernel.await_reply(msg_id, timeout=timeout)
+            kernel.await_reply(msg_id, timeout=timeout)
         except Empty:  # Timeout reached
             # Try to interrupt kernel, as this will give us traceback:
             kernel.interrupt()
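
Reviewer note (not part of the patch series): the two new code paths can be tried by hand with a small script. This is a minimal sketch assuming only the public nbformat v4 API and nbval's 'raises-exception' cell tag, which the tests above also use; the demo filename and the suggested pytest invocation are illustrative.

import nbformat
from nbformat.v4 import new_notebook, new_code_cell, new_output

nb = new_notebook()

# Cell 1: previously run, with a recorded error, but the code no longer
# raises. With patch 1, compare_outputs() should report the expected
# ename/evalue instead of the generic "missing output fields" message.
cell = new_code_cell("pass", execution_count=1)
cell.metadata.tags = ['raises-exception']
cell.outputs = [new_output('error', ename='ValueError', evalue='foo',
                           traceback=[])]
nb.cells.append(cell)

# Cell 2: unrun (no execution count, no outputs) but tagged as raising.
# With patch 1, runtest() should fail it outright, since output comparison
# cannot catch a missing error for an unrun reference cell.
cell = new_code_cell("pass")
cell.metadata.tags = ['raises-exception']
nb.cells.append(cell)

nbformat.write(nb, 'demo_expected_exceptions.ipynb')

# Expected: two failures, e.g.
#   pytest --nbval --current-env demo_expected_exceptions.ipynb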