With all the helpful libraries out there, you'd think Python would have an easier way of doing this!
Using urlparse() to get the ParseResult object only allows read-only access to the attributes, making it useless if you want to set the query field.
This solution comes from Ned Batchelder's answer on StackOverflow. I found it much more useful than what the urlparse and urllib libraries offer on their own.
Create a helper class:
import cgi, urllib, urlparse | |
# URL helpers under private aliases so the class works on Python 2 and 3
# without clashing with any module-level ``urllib`` / ``urlparse`` names.
try:  # Python 3
    from urllib.parse import parse_qsl as _parse_qsl
    from urllib.parse import urlencode as _urlencode
    from urllib.parse import urlparse as _urlparse
    from urllib.parse import urlunparse as _urlunparse
    _text_type = str
except ImportError:  # Python 2
    from urllib import urlencode as _urlencode
    from urlparse import parse_qsl as _parse_qsl
    from urlparse import urlparse as _urlparse
    from urlparse import urlunparse as _urlunparse
    _text_type = unicode  # noqa: F821 - builtin on Python 2 only


class Url(object):
    """
    Helper class to add/remove GET args from an existing URL.

    The URL is split into ``scheme``, ``domain``, ``path``, ``params``,
    ``query`` and ``fragment`` attributes, and the query string is exposed
    as the mutable dict ``args``. Edit ``args`` and read ``url`` (or call
    ``str()``) to get the rebuilt URL.

    Note that ``args`` is a plain dict, so a key repeated in the query
    string keeps only its last value.

    Usage:
        url = Url('http://www.koonkii.com/apps/?type=Android&page=5#top')
        del url.args['type']
        print(url.url)
        # -> http://www.koonkii.com/apps/?page=5#top

    @see http://stackoverflow.com/questions/2873438/is-there-a-better-way-to-write-this-url-manipulation-in-python
    """

    def __init__(self, url):
        """
        Construct from a string, Url object or Django's HttpRequest.
        """
        if isinstance(url, Url):
            url = url.url
        else:
            # Django is optional: only look for HttpRequest if it is installed.
            try:
                from django.http import HttpRequest
            except ImportError:
                pass
            else:
                if isinstance(url, HttpRequest):
                    url = url.get_full_path()

        # e.g. http, www.koonkii.com, /apps/, '', type=Android&page=5, top
        (self.scheme, self.domain, self.path,
         self.params, self.query, self.fragment) = _urlparse(url)
        self.args = dict(_parse_qsl(self.query))

    @staticmethod
    def _to_utf8(value):
        """Return text as UTF-8 bytes; leave every other value untouched."""
        if isinstance(value, _text_type):
            return value.encode('utf8')
        return value

    def _build_query(self):
        """
        Re-serialise ``args`` into ``self.query`` and return the result.

        Text keys and values are UTF-8 encoded first so that non-ASCII
        characters come out percent-encoded instead of raising an encoding
        error on Python 2.
        """
        encoded = [
            (self._to_utf8(key), self._to_utf8(value))
            for key, value in dict(self.args).items()
        ]
        # Keep the attribute in sync in case args has been modified.
        self.query = _urlencode(encoded)
        return self.query

    @property
    def url(self):
        """The full URL rebuilt from the current attributes and args."""
        return str(self)

    @property
    def query_string(self):
        """
        Returns args as a query string with a leading '?', or '' when there
        are no args.
        """
        query = self._build_query()
        return '?%s' % query if self.args else ''

    def __str__(self):
        """
        Turn back into a URL.

        Non-ASCII characters in args are UTF-8 encoded and then
        percent-encoded, the same way ``query_string`` does it.
        """
        return _urlunparse((
            self.scheme, self.domain, self.path,
            self.params, self._build_query(), self.fragment,
        ))
To get an idea of what's what:
# Walk through every attribute of a parsed URL. The sample URL includes a
# query string so that args, query and query_string have something to show.
txt = "https://gist.github.com/twig/d42c4ec9ecc5eccf3614?q=url&l=python#file-python-pil-exif-reader-py-L12"
url = Url(txt)
print("original: %s" % txt)
print("scheme: %s" % url.scheme)
print("domain: %s" % url.domain)
print("path: %s" % url.path)
print("params: %s" % url.params)
# Wrapped in a tuple so the dict is formatted as a value, not used as a mapping.
print("args: %s" % (url.args,))
print("query: %s" % url.query)
print("query_string: %s" % url.query_string)
print("fragment: %s" % url.fragment)
print("URL: %s" % url.url)
print("str: %s" % str(url))
Prints out:
original: https://gist.github.com/twig/d42c4ec9ecc5eccf3614?q=url&l=python#file-python-pil-exif-reader-py-L12
scheme: https
domain: gist.github.com
path: /twig/d42c4ec9ecc5eccf3614
params:
args: {'q': 'url', 'l': 'python'}
query: q=url&l=python
query_string: ?q=url&l=python
fragment: file-python-pil-exif-reader-py-L12
URL: https://gist.github.com/twig/d42c4ec9ecc5eccf3614?q=url&l=python#file-python-pil-exif-reader-py-L12
str: https://gist.github.com/twig/d42c4ec9ecc5eccf3614?q=url&l=python#file-python-pil-exif-reader-py-L12
To make changes, modify the args dictionary and read the URL back, like this:
# Remove a GET argument and print the rebuilt URL.
u = Url(url)
# The parsed arguments live in the ``args`` dict; ``query`` is the raw string.
del u.args['page']
print(u.url)
*update 23/08/2011*
- Removed the use of urllib.urlencode() because it's deprecated. Now uses urllib.
- Added __str__() to accompany __unicode__(). Added build() method.
- Renamed args to be query.
*update 11/11/2014*
- Cleaned up the code a bit
- Added Url.url and Url.query_string attributes
- Updated comments
- Renamed Url.netloc to Url.domain
- Now supports Url and Django's HttpRequest objects in __init__()