Python3 (and six) don't provide string

1c7 · 2020-12-21T14:08:56Z

come from coursera-dl/coursera-dl#778
this works perfectly!

Saksham2k1 · 2021-01-04T15:20:24Z

how to use it. i am using coursera-dl

1c7 · 2021-01-05T13:22:54Z

edit edx_dl/utils.py file, just like this commit did. copy & paste these code to correct position @Saksham2k1

yoshieki1992 · 2021-02-04T08:07:10Z

it works. Thank you

bqiu86 · 2021-06-12T02:49:57Z

worked for me! thanks!

VenkatsQuest · 2021-07-26T18:17:22Z

Great Fix , thanks it worked for me too.

CaptainJamesMaximus · 2021-08-11T18:18:41Z

edit edx_dl/utils.py file, just like this commit did. copy & paste these code to correct position @Saksham2k1

How do I locate this file in the system files?

joejony · 2021-09-08T00:08:20Z

did those changes and still didn't fix it (windows).

In the suggested function
```python
if sys.version_info[0] >= 3:
    import html
else:
    from six.moves import html_parser
    html = html_parser.HTMLParser()
```

I get: "Code is unreachable Pylance"

Here is how my utils.py file is:

`# -- coding: utf-8 --
"""
This module provides utility functions that are used within the script.
"""

import six
import sys
if sys.version_info[0] >= 3:
import html
else:
from six import html_parser
html = html_parser.HTMLParser()
import os
import re
import time
import json
import errno
import random
import string
import logging
import datetime
from bs4 import BeautifulSoup as BeautifulSoup_
from xml.sax.saxutils import escape, unescape
from six import iteritems

if six.PY3: # pragma: no cover
from urllib.parse import urlparse, urljoin
else:
from urlparse import urlparse, urljoin

# Python 3 (and six) don't provide string.letters, so alias the names here

if six.PY3:
from string import ascii_letters as string_ascii_letters
from string import digits as string_digits
else:
from string import letters as string_ascii_letters
from string import digits as string_digits

from .define import COURSERA_URL, WINDOWS_UNC_PREFIX

# Force use of bs4 with html.parser
def BeautifulSoup(page):
    """
    Parse a page with bs4, always selecting the 'html.parser' backend.

    @param page: Markup to parse; passed straight through to bs4.

    @return: Whatever BeautifulSoup_ returns for this markup.
    """
    parser_name = 'html.parser'
    return BeautifulSoup_(page, parser_name)

if six.PY2:
    def decode_input(x):
        """
        Decode a value read from stdin using the encoding of sys.stdin.

        Falls back to UTF-8 when sys.stdin reports no encoding.
        """
        encoding = sys.stdin.encoding
        return x.decode("UTF-8" if encoding is None else encoding)
else:
    def decode_input(x):
        """
        Return the input unchanged; no decoding is done on this branch.
        """
        return x

def spit_json(obj, filename):
    """
    Write obj to filename as JSON indented by 4 spaces.

    The file is opened in 'w' mode, so existing content is replaced.

    @param obj: JSON-serializable object to dump.
    @param filename: Path of the file to write.
    @type filename: str
    """
    with open(filename, 'w') as sink:
        json.dump(obj, sink, indent=4)

def slurp_json(filename):
    """
    Read filename and return its content parsed as JSON.

    @param filename: Path of the JSON file to read.
    @type filename: str

    @return: The object produced by json.load.
    """
    with open(filename) as source:
        parsed = json.load(source)
    return parsed

def is_debug_run():
    """
    Tell whether the root logger currently lets DEBUG messages through.

    @return: True if running with DEBUG loglevel.
    @rtype: bool
    """
    root_logger = logging.getLogger()
    return root_logger.isEnabledFor(logging.DEBUG)

def random_string(length):
    """
    Build a pseudo-random string of the requested length.

    Characters are drawn with random.choice from the module-level
    letter and digit alphabets.

    @param length: Number of characters to generate.
    @type length: int

    @return: The generated string.
    @rtype: str
    """
    alphabet = string_ascii_letters + string_digits
    picks = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picks)

# Taken from: https://wiki.python.org/moin/EscapingHtml
# escape() and unescape() take care of &, < and >; the quote characters
# need the extra entity table below.
HTML_ESCAPE_TABLE = {
    '"': "&quot;",
    "'": "&apos;"
}

# Reverse mapping (entity -> character) used when unescaping.
HTML_UNESCAPE_TABLE = dict((v, k) for k, v in HTML_ESCAPE_TABLE.items())


def unescape_html(s):
    """
    Undo HTML escaping and URL quoting of a string.

    The string is passed through html.unescape, then unquote_plus, and
    finally through xml.sax.saxutils.unescape with HTML_UNESCAPE_TABLE.

    @param s: Escaped string.
    @type s: str

    @return: Unescaped string.
    @rtype: str
    """
    # unquote_plus is not imported at module level; import it here with
    # the same Python 2 / Python 3 split the module uses for urlparse.
    try:
        from urllib.parse import unquote_plus
    except ImportError:
        from urllib import unquote_plus

    # HTMLParser.unescape() was removed in Python 3.9. The module-level
    # ``html`` name exposes unescape() on both import branches.
    s = html.unescape(s)
    s = unquote_plus(s)
    return unescape(s, HTML_UNESCAPE_TABLE)

def clean_filename(s, minimal_change=False):
    r"""
    Sanitize a string to be used as a filename.

    If minimal_change is set to true, then we only strip the bare minimum of
    characters that are problematic for filesystems (namely, ':', '/' and
    '\x00', '\n').

    @param s: Raw name, possibly HTML-escaped and/or URL-encoded.
    @type s: str

    @param minimal_change: Stop after replacing forbidden characters and
        trimming trailing dots and spaces.
    @type minimal_change: bool

    @return: Sanitized name.
    @rtype: str
    """
    # unquote_plus is not imported at module level; import it here with
    # the same Python 2 / Python 3 split the module uses for urlparse.
    try:
        from urllib.parse import unquote_plus
    except ImportError:
        from urllib import unquote_plus

    # First, deal with HTML-escaped and URL encoded strings. The
    # module-level ``html`` name exposes unescape() on both import branches.
    s = html.unescape(s)
    s = unquote_plus(s)

    # Strip forbidden characters
    # https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx
    for forbidden in ':/<>"\\|?*\x00':
        s = s.replace(forbidden, '-')
    s = s.replace('\n', ' ')

    # Remove trailing dots and spaces; forbidden on Windows
    s = s.rstrip(' .')

    if minimal_change:
        return s

    s = s.replace('(', '').replace(')', '')
    s = s.rstrip('.')  # Remove excess of trailing dots

    s = s.strip().replace(' ', '_')
    # Keep only characters from a conservative whitelist.
    valid_chars = '-_.()%s%s' % (string.ascii_letters, string.digits)
    return ''.join(c for c in s if c in valid_chars)

def normalize_path(path):
    r"""
    Normalize a path on Windows by prepending WINDOWS_UNC_PREFIX to its
    absolute form, which gives access to the Win32 device namespace instead
    of the Win32 file namespace. On any other platform, and for paths that
    already start with the prefix, the path is returned untouched.
    See https://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx#maxpath

    NOTE(review): the prefix is presumably ``\\?\`` -- confirm against
    the module that defines WINDOWS_UNC_PREFIX.

    @param path: Path to normalize.
    @type path: str

    @return: Normalized path.
    @rtype: str
    """
    not_windows = sys.platform != 'win32'
    if not_windows or path.startswith(WINDOWS_UNC_PREFIX):
        return path

    absolute = os.path.abspath(path)
    return WINDOWS_UNC_PREFIX + absolute

def get_anchor_format(a):
    """
    Pull the resource file-type format out of an anchor string.

    @param a: Anchor (URL-like string) to inspect.
    @type a: str

    @return: The matched format, or None when nothing matches.
    """
    # Pattern: ("." or "format=") + file_extension + (optional "?..." tail)
    # anchored at the end, e.g. "...format=txt" or "...download.mp4?..."
    match = re.search(r"(?:\.|format=)(\w+)(?:\?.*)?$", a)
    if match is None:
        return None
    return match.group(1)

def mkdir_p(path, mode=0o777):
    """
    Create the directory given by path together with any missing parents.

    An OSError with errno EEXIST is ignored when path is already a
    directory; every other OSError is re-raised.

    @param path: Directory to create.
    @type path: str

    @param mode: Mode passed to os.makedirs.
    @type mode: int
    """
    try:
        os.makedirs(path, mode)
    except OSError as exc:
        already_there = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not already_there:
            raise

def clean_url(url):
    """
    Remove params, query and fragment parts from URL so that os.path.basename
    and os.path.splitext can work correctly.

    @param url: URL to clean.
    @type url: str

    @return: Cleaned URL.
    @rtype: str
    """
    parsed = urlparse(url.strip())
    # The parse result is a named tuple, so blank the unwanted fields with
    # _replace(). This avoids needing ParseResult, which this module never
    # imports.
    stripped = parsed._replace(params='', query='', fragment='')
    return stripped.geturl()

def fix_url(url):
    """
    Trim surrounding whitespace from the url and prefix "http://" when
    the trimmed url is non-empty and has no scheme.

    @param url: URL to fix, or None.

    @return: None if url is None, otherwise the fixed URL.
    """
    if url is None:
        return None

    trimmed = url.strip()
    needs_scheme = bool(trimmed) and not urlparse(trimmed).scheme
    return "http://" + trimmed if needs_scheme else trimmed

def is_course_complete(last_update):
    """
    Determine whether the course is likely to have been terminated.

    We return True if the timestamp given by last_update is non-negative
    and more than 30 days older than the current time. Otherwise
    (negative timestamp, or an update within the last 30 days) we return
    False, since it is probably too soon to declare the course complete.

    @param last_update: Timestamp of the last update, compared against
        time.time().

    @return: True if the last update is more than 30 days old.
    @rtype: bool
    """
    if last_update < 0:
        return False

    elapsed = time.time() - last_update
    threshold = total_seconds(datetime.timedelta(days=30))
    return elapsed > threshold

def total_seconds(td):
    """
    Return the whole number of seconds in a timedelta, rounded down.

    Added for backward compatibility, pre 2.7.

    @param td: Duration to convert.
    @type td: datetime.timedelta

    @return: Total seconds, floored.
    @rtype: int
    """
    micros_per_second = 10 ** 6
    whole_seconds = td.seconds + td.days * 24 * 3600
    total_micros = td.microseconds + whole_seconds * micros_per_second
    return total_micros // micros_per_second

def make_coursera_absolute_url(url):
    """
    Return url unchanged when it already has a netloc; otherwise join it
    onto COURSERA_URL.

    @param url: Absolute or relative URL.
    @type url: str

    @return: Absolute URL.
    @rtype: str
    """
    has_netloc = bool(urlparse(url).netloc)
    return url if has_netloc else urljoin(COURSERA_URL, url)

def extend_supplement_links(destination, source):
    """
    Merge the supplement_links of source into destination, in place.

    Keys missing from destination are bound to the value object from
    source. Keys already present have their value extended, so values
    are expected to be lists, or any data structure with an extend method.

    @param destination: Destination dictionary that will be extended.
    @type destination: @see CourseraOnDemand._extract_links_from_text

    @param source: Source dictionary that will be used to extend
        destination dictionary.
    @type source: @see CourseraOnDemand._extract_links_from_text
    """
    for name, links in source.items():
        if name in destination:
            destination[name].extend(links)
        else:
            destination[name] = links

def print_ssl_error_message(exception):
    """
    Print SSLError message with URL to instructions on how to fix it.

    The message is emitted through logging.error and contains the
    exception's class name followed by its string form.

    @param exception: Exception that has just occurred.
    @type exception: Exception
    """
    # The class name lives in __name__; a plain ``.name`` attribute does not
    # exist on classes and would raise AttributeError here.
    message = """
#####################################################################

ATTENTION! PLEASE READ THIS!

The following error has just occurred:

%s %s

Please read instructions on how to fix this error here:

https://github.com/coursera-dl/coursera-dl#sslerror-errno-1-_sslc504-error14094410ssl-routinesssl3_read_bytessslv3-alert-handshake-failure

#####################################################################
""" % (type(exception).__name__, str(exception))
    logging.error(message)

aliarabbasi5155 · 2021-12-06T16:05:32Z

edit edx_dl/utils.py file, just like this commit did. copy & paste these code to correct position @Saksham2k1

Thanks, Worked for me...

leonardo73-max · 2022-01-09T15:43:43Z

Can you please give instructions on how to paste it? I can't wrap my head around it, I got notepad++ of course but Idk where to paste

shwhsx · 2022-05-14T21:15:53Z

I updated the util.py file and still get the same error: "File "/XXX/env/lib/python3.9/site-packages/coursera/utils.py", line 118, in clean_filename
s = h.unescape(s)
AttributeError: 'HTMLParser' object has no attribute 'unescape'"

The line s = h.unescape(s) is not on 118 and I already set h = html before that line. Any suggestions? Thanks!

//I figured it out. I revised the util file from previous download. After I fixed the correct file, it worked.

junaid1990 · 2022-05-19T11:18:08Z

problem not resolved

jonathann19 · 2022-09-22T17:49:36Z

thanks a lot, it worked for me!

imchamodi · 2023-09-18T06:44:09Z

Thanks a lot, it worked for me too!

amch-med23 · 2024-02-05T02:23:20Z

Thanks, this works fine, i had the same error while trying to install flasgger package on python3.10, i edited /usr/local/lib/python3.10/dist-packages/setuptool/py33compat.py , and i applied the fix and everything works now. I guess the error is caused by html_parser.HTMLParser() which doesn't exist in sys.version_info < 3.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Commit

There are no files selected for viewing

15 comments on commit `5490a99`

1c7 commented on `5490a99` Dec 21, 2020

Saksham2k1 commented on `5490a99` Jan 4, 2021

1c7 commented on `5490a99` Jan 5, 2021

yoshieki1992 commented on `5490a99` Feb 4, 2021

bqiu86 commented on `5490a99` Jun 12, 2021

VenkatsQuest commented on `5490a99` Jul 26, 2021

CaptainJamesMaximus commented on `5490a99` Aug 11, 2021 •

edited

joejony commented on `5490a99` Sep 8, 2021 •

edited

aliarabbasi5155 commented on `5490a99` Dec 6, 2021

leonardo73-max commented on `5490a99` Jan 9, 2022

shwhsx commented on `5490a99` May 14, 2022 •

edited

junaid1990 commented on `5490a99` May 19, 2022

jonathann19 commented on `5490a99` Sep 22, 2022

imchamodi commented on `5490a99` Sep 18, 2023

amch-med23 commented on `5490a99` Feb 5, 2024

Commit

There are no files selected for viewing

15 comments on commit 5490a99

1c7 commented on 5490a99 Dec 21, 2020

Choose a reason for hiding this comment

Saksham2k1 commented on 5490a99 Jan 4, 2021

Choose a reason for hiding this comment

1c7 commented on 5490a99 Jan 5, 2021

Choose a reason for hiding this comment

yoshieki1992 commented on 5490a99 Feb 4, 2021

Choose a reason for hiding this comment

bqiu86 commented on 5490a99 Jun 12, 2021

Choose a reason for hiding this comment

VenkatsQuest commented on 5490a99 Jul 26, 2021

Choose a reason for hiding this comment

CaptainJamesMaximus commented on 5490a99 Aug 11, 2021 • edited

Choose a reason for hiding this comment

joejony commented on 5490a99 Sep 8, 2021 • edited

Choose a reason for hiding this comment

Python3 (and six) don't provide string

Force us of bs4 with html.parser

Taken from: https://wiki.python.org/moin/EscapingHtml

escape() and unescape() takes care of &, < and >.

ATTENTION! PLEASE READ THIS!

The following error has just occurred:

%s %s

Please read instructions on how to fix this error here:

https://github.com/coursera-dl/coursera-dl#sslerror-errno-1-_sslc504-error14094410ssl-routinesssl3_read_bytessslv3-alert-handshake-failure

aliarabbasi5155 commented on 5490a99 Dec 6, 2021

Choose a reason for hiding this comment

leonardo73-max commented on 5490a99 Jan 9, 2022

Choose a reason for hiding this comment

shwhsx commented on 5490a99 May 14, 2022 • edited

Choose a reason for hiding this comment

junaid1990 commented on 5490a99 May 19, 2022

Choose a reason for hiding this comment

jonathann19 commented on 5490a99 Sep 22, 2022

Choose a reason for hiding this comment

imchamodi commented on 5490a99 Sep 18, 2023

Choose a reason for hiding this comment

amch-med23 commented on 5490a99 Feb 5, 2024

Choose a reason for hiding this comment

15 comments on commit `5490a99`

1c7 commented on `5490a99` Dec 21, 2020

Saksham2k1 commented on `5490a99` Jan 4, 2021

1c7 commented on `5490a99` Jan 5, 2021

yoshieki1992 commented on `5490a99` Feb 4, 2021

bqiu86 commented on `5490a99` Jun 12, 2021

VenkatsQuest commented on `5490a99` Jul 26, 2021

CaptainJamesMaximus commented on `5490a99` Aug 11, 2021 •

edited

joejony commented on `5490a99` Sep 8, 2021 •

edited

aliarabbasi5155 commented on `5490a99` Dec 6, 2021

leonardo73-max commented on `5490a99` Jan 9, 2022

shwhsx commented on `5490a99` May 14, 2022 •

edited

junaid1990 commented on `5490a99` May 19, 2022

jonathann19 commented on `5490a99` Sep 22, 2022

imchamodi commented on `5490a99` Sep 18, 2023

amch-med23 commented on `5490a99` Feb 5, 2024