Review of get_file_by_url()

Please note that this patch also introduce
which is in charge of converting a domain in a line into
IDNA and/or UTF-8 format.

Also note the introduction of BeautifulSoup() which helps
us to decode data from the downloaded URL.

Fixes (issue(s)/protocol(s) I was able to reproduce):
 * https://github.com/StevenBlack/hosts/issues/514#issuecomment-368932152

Possible fix of (issue(s)/protocol(s) I wasn't able to reproduce):
 * https://github.com/StevenBlack/hosts/issues/514#issue-300048106
 * https://github.com/StevenBlack/hosts/issues/494#issue-296166492
 * https://github.com/StevenBlack/hosts/issues/420#issue-267453114
 * https://github.com/StevenBlack/hosts/issues/372#issue-246927047
 * https://github.com/StevenBlack/hosts/issues/382#issuecomment-322010562
This commit is contained in:
funilrys
2018-02-28 23:06:58 +01:00
parent 37da299efb
commit c82f691952

View File

@ -6,23 +6,26 @@
# This Python script will combine all the host files you provide
# as sources into one, unique host file to keep you internet browsing happy.
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from glob import glob
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import os
import argparse
import fnmatch
import json
import locale
import os
import platform
import re
import shutil
import socket
import subprocess
import sys
import tempfile
import time
import fnmatch
import argparse
import socket
import json
from glob import glob
import lxml
from bs4 import BeautifulSoup
# Detecting Python 3 for version-dependent implementations
PY3 = sys.version_info >= (3, 0)
@ -1125,6 +1128,62 @@ def remove_old_hosts_file(backup):
open(old_file_path, "a").close()
# End File Logic
def domain_to_idna(line):
"""
Encode a domain which is presente into a line into `idna`. This way we avoid
the most encoding issue case.
Parameters
----------
line : str
The line we have to encode/decode.
Returns
-------
line : str
The line in a converted format.
Notes
-----
- This method/function encode only the domain to `idna` format because in
most cases the encoding issue is due to a domain which looks like
`b'\xc9\xa2oogle.com'.decode('idna')`.
- About the splitting:
We split because we only want to encode the domain and not the full line
which may cause some issue. Keep in mind that we split but we still
concatenate once we encoded the domain.
- The following split the prefix `0.0.0.0` or `127.0.0.1` of a line.
- The following also split the trailing comment of a given line.
- You do not get it ?
- Run https://git.io/vA1Rj and enjoy the view :-).
"""
if not line.startswith('#'):
for separator in [' ', '\t']:
comment_to_append = ''
if separator in line:
splited_line = line.split(separator)
if '#' in splited_line[1]:
comment_to_append = splited_line[1].split('#')[1]
if comment_to_append:
splited_line[1] = splited_line[1] \
.split(comment_to_append)[0] \
.encode("IDNA").decode("UTF-8") + \
'#' + comment_to_append[1]
else:
splited_line[1] = splited_line[1] \
.encode("IDNA") \
.decode("UTF-8") + '#'
else:
splited_line[1] = splited_line[1] \
.encode("IDNA") \
.decode("UTF-8")
return separator.join(splited_line)
return line.encode("IDNA").decode("UTF-8")
return line.encode("UTF-8").decode("UTF-8")
# Helper Functions
def get_file_by_url(url):
@ -1141,11 +1200,17 @@ def get_file_by_url(url):
url_data : str or None
The data retrieved at that URL from the file. Returns None if the
attempted retrieval is unsuccessful.
Note
----
- BeautifulSoup is used in this case to avoid having to search in which
format we have to encode or decode data before parsing it to UTF-8.
"""
try:
f = urlopen(url)
return f.read().decode("UTF-8")
soup = BeautifulSoup(f.read(),'lxml').get_text()
return '\n'.join(list(map(domain_to_idna, soup.split('\n'))))
except Exception:
print("Problem getting file: ", url)
@ -1165,7 +1230,10 @@ def write_data(f, data):
if PY3:
f.write(bytes(data, "UTF-8"))
else:
f.write(str(data).encode("UTF-8"))
try:
f.write(str(data))
except UnicodeEncodeError:
f.write(str(data.encode("UTF-8")))
def list_dir_no_hidden(path):