diff --git a/libs/bencode/LICENSE.txt b/libs/bencode/LICENSE.txt
deleted file mode 100644
index 4b7a6747..00000000
--- a/libs/bencode/LICENSE.txt
+++ /dev/null
@@ -1,143 +0,0 @@
-BitTorrent Open Source License
-
-Version 1.1
-
-This BitTorrent Open Source License (the "License") applies to the BitTorrent client and related software products as well as any updates or maintenance releases of that software ("BitTorrent Products") that are distributed by BitTorrent, Inc. ("Licensor"). Any BitTorrent Product licensed pursuant to this License is a Licensed Product. Licensed Product, in its entirety, is protected by U.S. copyright law. This License identifies the terms under which you may use, copy, distribute or modify Licensed Product.
-
-Preamble
-
-This Preamble is intended to describe, in plain English, the nature and scope of this License. However, this Preamble is not a part of this license. The legal effect of this License is dependent only upon the terms of the License and not this Preamble.
-
-This License complies with the Open Source Definition and is derived from the Jabber Open Source License 1.0 (the "JOSL"), which has been approved by Open Source Initiative. Sections 4(c) and 4(f)(iii) from the JOSL have been deleted.
-
-This License provides that:
-
-1. You may use or give away the Licensed Product, alone or as a component of an aggregate software distribution containing programs from several different sources. No royalty or other fee is required.
-
-2. Both Source Code and executable versions of the Licensed Product, including Modifications made by previous Contributors, are available for your use. (The terms "Licensed Product," "Modifications," "Contributors" and "Source Code" are defined in the License.)
-
-3. You are allowed to make Modifications to the Licensed Product, and you can create Derivative Works from it. (The term "Derivative Works" is defined in the License.)
-
-4. By accepting the Licensed Product under the provisions of this License, you agree that any Modifications you make to the Licensed Product and then distribute are governed by the provisions of this License. In particular, you must make the Source Code of your Modifications available to others free of charge and without a royalty.
-
-5. You may sell, accept donations or otherwise receive compensation for executable versions of a Licensed Product, without paying a royalty or other fee to the Licensor or any Contributor, provided that such executable versions contain your or another Contributor?s material Modifications. For the avoidance of doubt, to the extent your executable version of a Licensed Product does not contain your or another Contributor?s material Modifications, you may not sell, accept donations or otherwise receive compensation for such executable.
-
-You may use the Licensed Product for any purpose, but the Licensor is not providing you any warranty whatsoever, nor is the Licensor accepting any liability in the event that the Licensed Product doesn't work properly or causes you any injury or damages.
-
-6. If you sublicense the Licensed Product or Derivative Works, you may charge fees for warranty or support, or for accepting indemnity or liability obligations to your customers. You cannot charge for, sell, accept donations or otherwise receive compensation for the Source Code.
-
-7. If you assert any patent claims against the Licensor relating to the Licensed Product, or if you breach any terms of the License, your rights to the Licensed Product under this License automatically terminate.
-
-You may use this License to distribute your own Derivative Works, in which case the provisions of this License will apply to your Derivative Works just as they do to the original Licensed Product.
-
-Alternatively, you may distribute your Derivative Works under any other OSI-approved Open Source license, or under a proprietary license of your choice. If you use any license other than this License, however, you must continue to fulfill the requirements of this License (including the provisions relating to publishing the Source Code) for those portions of your Derivative Works that consist of the Licensed Product, including the files containing Modifications.
-
-New versions of this License may be published from time to time in connection with new versions of a Licensed Product or otherwise. You may choose to continue to use the license terms in this version of the License for the Licensed Product that was originally licensed hereunder, however, the new versions of this License will at all times apply to new versions of the Licensed Product released by Licensor after the release of the new version of this License. Only the Licensor has the right to change the License terms as they apply to the Licensed Product.
-
-This License relies on precise definitions for certain terms. Those terms are defined when they are first used, and the definitions are repeated for your convenience in a Glossary at the end of the License.
-
-License Terms
-
-1. Grant of License From Licensor. Subject to the terms and conditions of this License, Licensor hereby grants you a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims, to do the following:
-
-a. Use, reproduce, modify, display, perform, sublicense and distribute any Modifications created by a Contributor or portions thereof, in both Source Code or as an executable program, either on an unmodified basis or as part of Derivative Works.
-
-b. Under claims of patents now or hereafter owned or controlled by Contributor, to make, use, sell, offer for sale, have made, and/or otherwise dispose of Modifications or portions thereof, but solely to the extent that any such claim is necessary to enable you to make, use, sell, offer for sale, have made, and/or otherwise dispose of Modifications or portions thereof or Derivative Works thereof.
-
-2. Grant of License to Modifications From Contributor. "Modifications" means any additions to or deletions from the substance or structure of (i) a file containing a Licensed Product, or (ii) any new file that contains any part of a Licensed Product. Hereinafter in this License, the term "Licensed Product" shall include all previous Modifications that you receive from any Contributor. Subject to the terms and conditions of this License, By application of the provisions in Section 4(a) below, each person or entity who created or contributed to the creation of, and distributed, a Modification (a "Contributor") hereby grants you a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims, to do the following:
-
-a. Use, reproduce, modify, display, perform, sublicense and distribute any Modifications created by such Contributor or portions thereof, in both Source Code or as an executable program, either on an unmodified basis or as part of Derivative Works.
-
-b. Under claims of patents now or hereafter owned or controlled by Contributor, to make, use, sell, offer for sale, have made, and/or otherwise dispose of Modifications or portions thereof, but solely to the extent that any such claim is necessary to enable you to make, use, sell, offer for sale, have made, and/or otherwise dispose of Modifications or portions thereof or Derivative Works thereof.
-
-3. Exclusions From License Grant. Nothing in this License shall be deemed to grant any rights to trademarks, copyrights, patents, trade secrets or any other intellectual property of Licensor or any Contributor except as expressly stated herein. No patent license is granted separate from the Licensed Product, for code that you delete from the Licensed Product, or for combinations of the Licensed Product with other software or hardware. No right is granted to the trademarks of Licensor or any Contributor even if such marks are included in the Licensed Product. Nothing in this License shall be interpreted to prohibit Licensor from licensing under different terms from this License any code that Licensor otherwise would have a right to license. As an express condition for your use of the Licensed Product, you hereby agree that you will not, without the prior written consent of Licensor, use any trademarks, copyrights, patents, trade secrets or any other intellectual property of Licensor or any Contributor except as expressly stated herein. For the avoidance of doubt and without limiting the foregoing, you hereby agree that you will not use or display any trademark of Licensor or any Contributor in any domain name, directory filepath, advertisement, link or other reference to you in any manner or in any media.
-
-4. Your Obligations Regarding Distribution.
-
-a. Application of This License to Your Modifications. As an express condition for your use of the Licensed Product, you hereby agree that any Modifications that you create or to which you contribute, and which you distribute, are governed by the terms of this License including, without limitation, Section 2. Any Modifications that you create or to which you contribute may be distributed only under the terms of this License or a future version of this License released under Section 7. You must include a copy of this License with every copy of the Modifications you distribute. You agree not to offer or impose any terms on any Source Code or executable version of the Licensed Product or Modifications that alter or restrict the applicable version of this License or the recipients' rights hereunder. However, you may include an additional document offering the additional rights described in Section 4(d).
-
-b. Availability of Source Code. You must make available, without charge, under the terms of this License, the Source Code of the Licensed Product and any Modifications that you distribute, either on the same media as you distribute any executable or other form of the Licensed Product, or via a mechanism generally accepted in the software development community for the electronic transfer of data (an "Electronic Distribution Mechanism"). The Source Code for any version of Licensed Product or Modifications that you distribute must remain available for as long as any executable or other form of the Licensed Product is distributed by you. You are responsible for ensuring that the Source Code version remains available even if the Electronic Distribution Mechanism is maintained by a third party.
-
-c. Intellectual Property Matters.
-
- i. Third Party Claims. If you have knowledge that a license to a third party's intellectual property right is required to exercise the rights granted by this License, you must include a text file with the Source Code distribution titled "LEGAL" that describes the claim and the party making the claim in sufficient detail that a recipient will know whom to contact. If you obtain such knowledge after you make any Modifications available as described in Section 4(b), you shall promptly modify the LEGAL file in all copies you make available thereafter and shall take other steps (such as notifying appropriate mailing lists or newsgroups) reasonably calculated to inform those who received the Licensed Product from you that new knowledge has been obtained.
-
- ii. Contributor APIs. If your Modifications include an application programming interface ("API") and you have knowledge of patent licenses that are reasonably necessary to implement that API, you must also include this information in the LEGAL file.
-
- iii. Representations. You represent that, except as disclosed pursuant to 4(c)(i) above, you believe that any Modifications you distribute are your original creations and that you have sufficient rights to grant the rights conveyed by this License.
-
-d. Required Notices. You must duplicate this License in any documentation you provide along with the Source Code of any Modifications you create or to which you contribute, and which you distribute, wherever you describe recipients' rights relating to Licensed Product. You must duplicate the notice contained in Exhibit A (the "Notice") in each file of the Source Code of any copy you distribute of the Licensed Product. If you created a Modification, you may add your name as a Contributor to the Notice. If it is not possible to put the Notice in a particular Source Code file due to its structure, then you must include such Notice in a location (such as a relevant directory file) where a user would be likely to look for such a notice. You may choose to offer, and charge a fee for, warranty, support, indemnity or liability obligations to one or more recipients of Licensed Product. However, you may do so only on your own behalf, and not on behalf of the Licensor or any Contributor. You must make it clear that any such warranty, support, indemnity or liability obligation is offered by you alone, and you hereby agree to indemnify the Licensor and every Contributor for any liability incurred by the Licensor or such Contributor as a result of warranty, support, indemnity or liability terms you offer.
-
-e. Distribution of Executable Versions. You may distribute Licensed Product as an executable program under a license of your choice that may contain terms different from this License provided (i) you have satisfied the requirements of Sections 4(a) through 4(d) for that distribution, (ii) you include a conspicuous notice in the executable version, related documentation and collateral materials stating that the Source Code version of the
-Licensed Product is available under the terms of this License, including a description of how and where you have fulfilled the obligations of Section 4(b), and (iii) you make it clear that any terms that differ from this License are offered by you alone, not by Licensor or any Contributor. You hereby agree to indemnify the Licensor and every Contributor for any liability incurred by Licensor or such Contributor as a result of any terms you offer.
-
-f. Distribution of Derivative Works. You may create Derivative Works (e.g., combinations of some or all of the Licensed Product with other code) and distribute the Derivative Works as products under any other license you select, with the proviso that the requirements of this License are fulfilled for those portions of the Derivative Works that consist of the Licensed Product or any Modifications thereto.
-
-g. Compensation for Distribution of Executable Versions of Licensed Products, Modifications or Derivative Works. Notwithstanding any provision of this License to the contrary, by distributing, selling, licensing, sublicensing or otherwise making available any Licensed Product, or Modification or Derivative Work thereof, you and Licensor hereby acknowledge and agree that you may sell, license or sublicense for a fee, accept donations or otherwise receive compensation for executable versions of a Licensed Product, without paying a royalty or other fee to the Licensor or any other Contributor, provided that such executable versions (i) contain your or another Contributor?s material Modifications, or (ii) are otherwise material Derivative Works. For purposes of this License, an executable version of the Licensed Product will be deemed to contain a material Modification, or will otherwise be deemed a material Derivative Work, if (a) the Licensed Product is modified with your own or a third party?s software programs or other code, and/or the Licensed Product is combined with a number of your own or a third party?s software programs or code, respectively, and (b) such software programs or code add or contribute material value, functionality or features to the License Product. For the avoidance of doubt, to the extent your executable version of a Licensed Product does not contain your or another Contributor?s material Modifications or is otherwise not a material Derivative Work, in each case as contemplated herein, you may not sell, license or sublicense for a fee, accept donations or otherwise receive compensation for such executable. Additionally, without limitation of the foregoing and notwithstanding any provision of this License to the contrary, you cannot charge for, sell, license or sublicense for a fee, accept donations or otherwise receive compensation for the Source Code.
-
-5. Inability to Comply Due to Statute or Regulation. If it is impossible for you to comply with any of the terms of this License with respect to some or all of the Licensed Product due to statute, judicial order, or regulation, then you must (i) comply with the terms of this License to the maximum extent possible, (ii) cite the statute or regulation that prohibits you from adhering to the License, and (iii) describe the limitations and the code they affect. Such description must be included in the LEGAL file described in Section 4(d), and must be included with all distributions of the Source Code. Except to the extent prohibited by statute or regulation, such description must be sufficiently detailed for a recipient of ordinary skill at computer programming to be able to understand it.
-
-6. Application of This License. This License applies to code to which Licensor or Contributor has attached the Notice in Exhibit A, which is incorporated herein by this reference.
-
-7. Versions of This License.
-
-a. New Versions. Licensor may publish from time to time revised and/or new versions of the License.
-
-b. Effect of New Versions. Once Licensed Product has been published under a particular version of the License, you may always continue to use it under the terms of that version, provided that any such license be in full force and effect at the time, and has not been revoked or otherwise terminated. You may also choose to use such Licensed Product under the terms of any subsequent version (but not any prior version) of the License published by Licensor. No one other than Licensor has the right to modify the terms applicable to Licensed Product created under this License.
-
-c. Derivative Works of this License. If you create or use a modified version of this License, which you may do only in order to apply it to software that is not already a Licensed Product under this License, you must rename your license so that it is not confusingly similar to this License, and must make it clear that your license contains terms that differ from this License. In so naming your license, you may not use any trademark of Licensor or any Contributor.
-
-8. Disclaimer of Warranty. LICENSED PRODUCT IS PROVIDED UNDER THIS LICENSE ON AN AS IS BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE LICENSED PRODUCT IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LICENSED PRODUCT IS WITH YOU. SHOULD LICENSED PRODUCT PROVE DEFECTIVE IN ANY RESPECT, YOU (AND NOT THE LICENSOR OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS
-DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF LICENSED PRODUCT IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER.
-
-9. Termination.
-
-a. Automatic Termination Upon Breach. This license and the rights granted hereunder will terminate automatically if you fail to comply with the terms herein and fail to cure such breach within ten (10) days of being notified of the breach by the Licensor. For purposes of this provision, proof of delivery via email to the address listed in the ?WHOIS? database of the registrar for any website through which you distribute or market any Licensed Product, or to any alternate email address which you designate in writing to the Licensor, shall constitute sufficient notification. All sublicenses to the Licensed Product that are properly granted shall survive any termination of this license so long as they continue to complye with the terms of this License. Provisions that, by their nature, must remain in effect beyond the termination of this License, shall survive.
-
-b. Termination Upon Assertion of Patent Infringement. If you initiate litigation by asserting a patent infringement claim (excluding declaratory judgment actions) against Licensor or a Contributor (Licensor or Contributor against whom you file such an action is referred to herein as Respondent) alleging that Licensed Product directly or indirectly infringes any patent, then any and all rights granted by such Respondent to you under Sections 1 or 2 of this License shall terminate prospectively upon sixty (60) days notice from Respondent (the "Notice Period") unless within that Notice Period you either agree in writing (i) to pay Respondent a mutually agreeable reasonably royalty for your past or future use of Licensed Product made by such Respondent, or (ii) withdraw your litigation claim with respect to Licensed Product against such Respondent. If within said Notice Period a reasonable royalty and payment arrangement are not mutually agreed upon in writing by the parties or the litigation claim is not withdrawn, the rights granted by Licensor to you under Sections 1 and 2 automatically terminate at the expiration of said Notice Period.
-
-c. Reasonable Value of This License. If you assert a patent infringement claim against Respondent alleging that Licensed Product directly or indirectly infringes any patent where such claim is resolved (such as by license or settlement) prior to the initiation of patent infringement litigation, then the reasonable value of the licenses granted by said Respondent under Sections 1 and 2 shall be taken into account in determining the amount or value of any payment or license.
-
-d. No Retroactive Effect of Termination. In the event of termination under Sections 9(a) or 9(b) above, all end user license agreements (excluding licenses to distributors and resellers) that have been validly granted by you or any distributor hereunder prior to termination shall survive termination.
-
-10. Limitation of Liability. UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL THE LICENSOR, ANY CONTRIBUTOR, OR ANY DISTRIBUTOR OF LICENSED PRODUCT, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY RESULTING FROM SUCH PARTYS NEGLIGENCE TO THE EXTENT APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU.
-
-11. Responsibility for Claims. As between Licensor and Contributors, each party is responsible for claims and damages arising, directly or indirectly, out of its utilization of rights under this License. You agree to work with Licensor and Contributors to distribute such responsibility on an equitable basis. Nothing herein is intended or shall be deemed to constitute any admission of liability.
-
-12. U.S. Government End Users. The Licensed Product is a commercial item, as that term is defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of commercial computer software and commercial computer software documentation, as such terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all U.S. Government End Users acquire Licensed Product with only those rights set forth herein.
-
-13. Miscellaneous. This License represents the complete agreement concerning the subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. This License shall be governed by California law provisions (except to the extent applicable law, if any, provides otherwise), excluding its conflict-of-law provisions. You expressly agree that in any litigation relating to this license the losing party shall be responsible for costs including, without limitation, court costs and reasonable attorneys fees and expenses. The application of the United Nations Convention on Contracts for the International Sale of Goods is expressly excluded. Any law or regulation that provides that the language of a contract shall be construed against the drafter shall not apply to this License.
-
-14. Definition of You in This License. You throughout this License, whether in upper or lower case, means an individual or a legal entity exercising rights under, and complying with all of the terms of, this License or a future version of this License issued under Section 7. For legal entities, you includes any entity that controls, is controlled by, is under common control with, or affiliated with, you. For purposes of this definition, control means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. You are responsible for advising any affiliated entity of the terms of this License, and that any rights or privileges derived from or obtained by way of this License are subject to the restrictions outlined herein.
-
-15. Glossary. All defined terms in this License that are used in more than one Section of this License are repeated here, in alphabetical order, for the convenience of the reader. The Section of this License in which each defined term is first used is shown in parentheses.
-
-Contributor: Each person or entity who created or contributed to the creation of, and distributed, a Modification. (See Section 2)
-
-Derivative Works: That term as used in this License is defined under U.S. copyright law. (See Section 1(b))
-
-License: This BitTorrent Open Source License. (See first paragraph of License)
-
-Licensed Product: Any BitTorrent Product licensed pursuant to this License. The term "Licensed Product" includes all previous Modifications from any Contributor that you receive. (See first paragraph of License and Section 2)
-
-Licensor: BitTorrent, Inc. (See first paragraph of License)
-
-Modifications: Any additions to or deletions from the substance or structure of (i) a file containing Licensed Product, or (ii) any new file that contains any part of Licensed Product. (See Section 2)
-
-Notice: The notice contained in Exhibit A. (See Section 4(e))
-
-Source Code: The preferred form for making modifications to the Licensed Product, including all modules contained therein, plus any associated interface definition files, scripts used to control compilation and installation of an executable program, or a list of differential comparisons against the Source Code of the Licensed Product. (See Section 1(a))
-
-You: This term is defined in Section 14 of this License.
-
-
-EXHIBIT A
-
-The Notice below must appear in each file of the Source Code of any copy you distribute of the Licensed Product or any hereto. Contributors to any Modifications may add their own copyright notices to identify their own contributions.
-
-License:
-
-The contents of this file are subject to the BitTorrent Open Source License Version 1.0 (the License). You may not copy or use this file, in either source code or executable form, except in compliance with the License. You may obtain a copy of the License at http://www.bittorrent.com/license/.
-
-Software distributed under the License is distributed on an AS IS basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License.
-
diff --git a/libs/bencode/__init__.py b/libs/bencode/__init__.py
index 4424fc7e..7a2af172 100644
--- a/libs/bencode/__init__.py
+++ b/libs/bencode/__init__.py
@@ -1 +1,131 @@
-from bencode import *
\ No newline at end of file
+# The contents of this file are subject to the BitTorrent Open Source License
+# Version 1.1 (the License). You may not copy or use this file, in either
+# source code or executable form, except in compliance with the License. You
+# may obtain a copy of the License at http://www.bittorrent.com/license/.
+#
+# Software distributed under the License is distributed on an AS IS basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+
+# Written by Petru Paler
+
+from BTL import BTFailure
+
+
+def decode_int(x, f):
+ f += 1
+ newf = x.index('e', f)
+ n = int(x[f:newf])
+ if x[f] == '-':
+ if x[f + 1] == '0':
+ raise ValueError
+ elif x[f] == '0' and newf != f+1:
+ raise ValueError
+ return (n, newf+1)
+
+def decode_string(x, f):
+ colon = x.index(':', f)
+ n = int(x[f:colon])
+ if x[f] == '0' and colon != f+1:
+ raise ValueError
+ colon += 1
+ return (x[colon:colon+n], colon+n)
+
+def decode_list(x, f):
+ r, f = [], f+1
+ while x[f] != 'e':
+ v, f = decode_func[x[f]](x, f)
+ r.append(v)
+ return (r, f + 1)
+
+def decode_dict(x, f):
+ r, f = {}, f+1
+ while x[f] != 'e':
+ k, f = decode_string(x, f)
+ r[k], f = decode_func[x[f]](x, f)
+ return (r, f + 1)
+
+decode_func = {}
+decode_func['l'] = decode_list
+decode_func['d'] = decode_dict
+decode_func['i'] = decode_int
+decode_func['0'] = decode_string
+decode_func['1'] = decode_string
+decode_func['2'] = decode_string
+decode_func['3'] = decode_string
+decode_func['4'] = decode_string
+decode_func['5'] = decode_string
+decode_func['6'] = decode_string
+decode_func['7'] = decode_string
+decode_func['8'] = decode_string
+decode_func['9'] = decode_string
+
+def bdecode(x):
+ try:
+ r, l = decode_func[x[0]](x, 0)
+ except (IndexError, KeyError, ValueError):
+ raise BTFailure("not a valid bencoded string")
+ if l != len(x):
+ raise BTFailure("invalid bencoded value (data after valid prefix)")
+ return r
+
+from types import StringType, IntType, LongType, DictType, ListType, TupleType
+
+
+class Bencached(object):
+
+ __slots__ = ['bencoded']
+
+ def __init__(self, s):
+ self.bencoded = s
+
+def encode_bencached(x,r):
+ r.append(x.bencoded)
+
+def encode_int(x, r):
+ r.extend(('i', str(x), 'e'))
+
+def encode_bool(x, r):
+ if x:
+ encode_int(1, r)
+ else:
+ encode_int(0, r)
+
+def encode_string(x, r):
+ r.extend((str(len(x)), ':', x))
+
+def encode_list(x, r):
+ r.append('l')
+ for i in x:
+ encode_func[type(i)](i, r)
+ r.append('e')
+
+def encode_dict(x,r):
+ r.append('d')
+ ilist = x.items()
+ ilist.sort()
+ for k, v in ilist:
+ r.extend((str(len(k)), ':', k))
+ encode_func[type(v)](v, r)
+ r.append('e')
+
+encode_func = {}
+encode_func[Bencached] = encode_bencached
+encode_func[IntType] = encode_int
+encode_func[LongType] = encode_int
+encode_func[StringType] = encode_string
+encode_func[ListType] = encode_list
+encode_func[TupleType] = encode_list
+encode_func[DictType] = encode_dict
+
+try:
+ from types import BooleanType
+ encode_func[BooleanType] = encode_bool
+except ImportError:
+ pass
+
+def bencode(x):
+ r = []
+ encode_func[type(x)](x, r)
+ return ''.join(r)
diff --git a/libs/bencode/bencode.py b/libs/bencode/bencode.py
deleted file mode 100644
index 7a2af172..00000000
--- a/libs/bencode/bencode.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# The contents of this file are subject to the BitTorrent Open Source License
-# Version 1.1 (the License). You may not copy or use this file, in either
-# source code or executable form, except in compliance with the License. You
-# may obtain a copy of the License at http://www.bittorrent.com/license/.
-#
-# Software distributed under the License is distributed on an AS IS basis,
-# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
-# for the specific language governing rights and limitations under the
-# License.
-
-# Written by Petru Paler
-
-from BTL import BTFailure
-
-
-def decode_int(x, f):
- f += 1
- newf = x.index('e', f)
- n = int(x[f:newf])
- if x[f] == '-':
- if x[f + 1] == '0':
- raise ValueError
- elif x[f] == '0' and newf != f+1:
- raise ValueError
- return (n, newf+1)
-
-def decode_string(x, f):
- colon = x.index(':', f)
- n = int(x[f:colon])
- if x[f] == '0' and colon != f+1:
- raise ValueError
- colon += 1
- return (x[colon:colon+n], colon+n)
-
-def decode_list(x, f):
- r, f = [], f+1
- while x[f] != 'e':
- v, f = decode_func[x[f]](x, f)
- r.append(v)
- return (r, f + 1)
-
-def decode_dict(x, f):
- r, f = {}, f+1
- while x[f] != 'e':
- k, f = decode_string(x, f)
- r[k], f = decode_func[x[f]](x, f)
- return (r, f + 1)
-
-decode_func = {}
-decode_func['l'] = decode_list
-decode_func['d'] = decode_dict
-decode_func['i'] = decode_int
-decode_func['0'] = decode_string
-decode_func['1'] = decode_string
-decode_func['2'] = decode_string
-decode_func['3'] = decode_string
-decode_func['4'] = decode_string
-decode_func['5'] = decode_string
-decode_func['6'] = decode_string
-decode_func['7'] = decode_string
-decode_func['8'] = decode_string
-decode_func['9'] = decode_string
-
-def bdecode(x):
- try:
- r, l = decode_func[x[0]](x, 0)
- except (IndexError, KeyError, ValueError):
- raise BTFailure("not a valid bencoded string")
- if l != len(x):
- raise BTFailure("invalid bencoded value (data after valid prefix)")
- return r
-
-from types import StringType, IntType, LongType, DictType, ListType, TupleType
-
-
-class Bencached(object):
-
- __slots__ = ['bencoded']
-
- def __init__(self, s):
- self.bencoded = s
-
-def encode_bencached(x,r):
- r.append(x.bencoded)
-
-def encode_int(x, r):
- r.extend(('i', str(x), 'e'))
-
-def encode_bool(x, r):
- if x:
- encode_int(1, r)
- else:
- encode_int(0, r)
-
-def encode_string(x, r):
- r.extend((str(len(x)), ':', x))
-
-def encode_list(x, r):
- r.append('l')
- for i in x:
- encode_func[type(i)](i, r)
- r.append('e')
-
-def encode_dict(x,r):
- r.append('d')
- ilist = x.items()
- ilist.sort()
- for k, v in ilist:
- r.extend((str(len(k)), ':', k))
- encode_func[type(v)](v, r)
- r.append('e')
-
-encode_func = {}
-encode_func[Bencached] = encode_bencached
-encode_func[IntType] = encode_int
-encode_func[LongType] = encode_int
-encode_func[StringType] = encode_string
-encode_func[ListType] = encode_list
-encode_func[TupleType] = encode_list
-encode_func[DictType] = encode_dict
-
-try:
- from types import BooleanType
- encode_func[BooleanType] = encode_bool
-except ImportError:
- pass
-
-def bencode(x):
- r = []
- encode_func[type(x)](x, r)
- return ''.join(r)
diff --git a/libs/bs4/__init__.py b/libs/bs4/__init__.py
index af8c718d..7ba34269 100644
--- a/libs/bs4/__init__.py
+++ b/libs/bs4/__init__.py
@@ -17,16 +17,17 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.1.0"
-__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
+__version__ = "4.3.2"
+__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
+import os
import re
import warnings
-from .builder import builder_registry
+from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
CData,
@@ -74,11 +75,7 @@ class BeautifulSoup(Tag):
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
- # Used when determining whether a text node is all whitespace and
- # can be replaced with a single space. A text node that contains
- # fancy Unicode spaces (usually non-breaking) should be left
- # alone.
- STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
+ ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
@@ -149,7 +146,7 @@ class BeautifulSoup(Tag):
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
- raise ValueError(
+ raise FeatureNotFound(
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
@@ -160,18 +157,46 @@ class BeautifulSoup(Tag):
self.parse_only = parse_only
- self.reset()
-
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
- (self.markup, self.original_encoding, self.declared_html_encoding,
- self.contains_replacement_characters) = (
- self.builder.prepare_markup(markup, from_encoding))
+ elif len(markup) <= 256:
+ # Print out warnings for a couple beginner problems
+ # involving passing non-markup to Beautiful Soup.
+ # Beautiful Soup will still parse the input as markup,
+ # just in case that's what the user really wants.
+ if (isinstance(markup, unicode)
+ and not os.path.supports_unicode_filenames):
+ possible_filename = markup.encode("utf8")
+ else:
+ possible_filename = markup
+ is_file = False
+ try:
+ is_file = os.path.exists(possible_filename)
+ except Exception, e:
+ # This is almost certainly a problem involving
+ # characters not valid in filenames on this
+ # system. Just let it go.
+ pass
+ if is_file:
+ warnings.warn(
+ '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
+ if markup[:5] == "http:" or markup[:6] == "https:":
+ # TODO: This is ugly but I couldn't get it to work in
+ # Python 3 otherwise.
+ if ((isinstance(markup, bytes) and not b' ' in markup)
+ or (isinstance(markup, unicode) and not u' ' in markup)):
+ warnings.warn(
+ '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
- try:
- self._feed()
- except StopParsing:
- pass
+ for (self.markup, self.original_encoding, self.declared_html_encoding,
+ self.contains_replacement_characters) in (
+ self.builder.prepare_markup(markup, from_encoding)):
+ self.reset()
+ try:
+ self._feed()
+ break
+ except ParserRejectedMarkup:
+ pass
# Clear out the markup and remove the builder's circular
# reference to this object.
@@ -192,29 +217,32 @@ class BeautifulSoup(Tag):
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
- self.currentData = []
+ self.current_data = []
self.currentTag = None
self.tagStack = []
+ self.preserve_whitespace_tag_stack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
- def new_string(self, s):
+ def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
- navigable = NavigableString(s)
+ navigable = subclass(s)
navigable.setup()
return navigable
def insert_before(self, successor):
- raise ValueError("BeautifulSoup objects don't support insert_before().")
+ raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
- raise ValueError("BeautifulSoup objects don't support insert_after().")
+ raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
tag = self.tagStack.pop()
+ if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
+ self.preserve_whitespace_tag_stack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
@@ -226,32 +254,49 @@ class BeautifulSoup(Tag):
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
+ if tag.name in self.builder.preserve_whitespace_tags:
+ self.preserve_whitespace_tag_stack.append(tag)
def endData(self, containerClass=NavigableString):
- if self.currentData:
- currentData = u''.join(self.currentData)
- if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
- not set([tag.name for tag in self.tagStack]).intersection(
- self.builder.preserve_whitespace_tags)):
- if '\n' in currentData:
- currentData = '\n'
- else:
- currentData = ' '
- self.currentData = []
+ if self.current_data:
+ current_data = u''.join(self.current_data)
+ # If whitespace is not preserved, and this string contains
+ # nothing but ASCII spaces, replace it with a single space
+ # or newline.
+ if not self.preserve_whitespace_tag_stack:
+ strippable = True
+ for i in current_data:
+ if i not in self.ASCII_SPACES:
+ strippable = False
+ break
+ if strippable:
+ if '\n' in current_data:
+ current_data = '\n'
+ else:
+ current_data = ' '
+
+ # Reset the data collector.
+ self.current_data = []
+
+ # Should we add this string to the tree at all?
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
- not self.parse_only.search(currentData)):
+ not self.parse_only.search(current_data)):
return
- o = containerClass(currentData)
+
+ o = containerClass(current_data)
self.object_was_parsed(o)
- def object_was_parsed(self, o):
+ def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree."""
- o.setup(self.currentTag, self.previous_element)
- if self.previous_element:
- self.previous_element.next_element = o
- self.previous_element = o
- self.currentTag.contents.append(o)
+ parent = parent or self.currentTag
+ most_recent_element = most_recent_element or self._most_recent_element
+ o.setup(parent, most_recent_element)
+
+ if most_recent_element is not None:
+ most_recent_element.next_element = o
+ self._most_recent_element = o
+ parent.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
@@ -260,22 +305,21 @@ class BeautifulSoup(Tag):
the given tag."""
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
+ # The BeautifulSoup object itself can never be popped.
return
- numPops = 0
- mostRecentTag = None
+ most_recently_popped = None
- for i in range(len(self.tagStack) - 1, 0, -1):
- if (name == self.tagStack[i].name
- and nsprefix == self.tagStack[i].nsprefix == nsprefix):
- numPops = len(self.tagStack) - i
+ stack_size = len(self.tagStack)
+ for i in range(stack_size - 1, 0, -1):
+ t = self.tagStack[i]
+ if (name == t.name and nsprefix == t.prefix):
+ if inclusivePop:
+ most_recently_popped = self.popTag()
break
- if not inclusivePop:
- numPops = numPops - 1
+ most_recently_popped = self.popTag()
- for i in range(0, numPops):
- mostRecentTag = self.popTag()
- return mostRecentTag
+ return most_recently_popped
def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
@@ -295,12 +339,12 @@ class BeautifulSoup(Tag):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
- self.currentTag, self.previous_element)
+ self.currentTag, self._most_recent_element)
if tag is None:
return tag
- if self.previous_element:
- self.previous_element.next_element = tag
- self.previous_element = tag
+ if self._most_recent_element:
+ self._most_recent_element.next_element = tag
+ self._most_recent_element = tag
self.pushTag(tag)
return tag
@@ -310,7 +354,7 @@ class BeautifulSoup(Tag):
self._popToTag(name, nsprefix)
def handle_data(self, data):
- self.currentData.append(data)
+ self.current_data.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
@@ -333,6 +377,10 @@ class BeautifulSoup(Tag):
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
+# Alias to make it easier to type import: 'from bs4 import _soup'
+_s = BeautifulSoup
+_soup = BeautifulSoup
+
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""
@@ -347,6 +395,9 @@ class BeautifulStoneSoup(BeautifulSoup):
class StopParsing(Exception):
pass
+class FeatureNotFound(ValueError):
+ pass
+
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py
index 4c22b864..740f5f29 100644
--- a/libs/bs4/builder/__init__.py
+++ b/libs/bs4/builder/__init__.py
@@ -147,18 +147,29 @@ class TreeBuilder(object):
Modifies its input in place.
"""
+ if not attrs:
+ return attrs
if self.cdata_list_attributes:
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
- tag_name.lower(), [])
- for cdata_list_attr in itertools.chain(universal, tag_specific):
- if cdata_list_attr in dict(attrs):
- # Basically, we have a "class" attribute whose
- # value is a whitespace-separated list of CSS
- # classes. Split it into a list.
- value = attrs[cdata_list_attr]
- values = whitespace_re.split(value)
- attrs[cdata_list_attr] = values
+ tag_name.lower(), None)
+ for attr in attrs.keys():
+ if attr in universal or (tag_specific and attr in tag_specific):
+ # We have a "class"-type attribute whose string
+ # value is a whitespace-separated list of
+ # values. Split it into a list.
+ value = attrs[attr]
+ if isinstance(value, basestring):
+ values = whitespace_re.split(value)
+ else:
+ # html5lib sometimes calls setAttributes twice
+ # for the same tag when rearranging the parse
+ # tree. On the second call the attribute value
+ # here is already a list. If this happens,
+ # leave the value alone rather than trying to
+ # split it again.
+ values = value
+ attrs[attr] = values
return attrs
class SAXTreeBuilder(TreeBuilder):
@@ -287,6 +298,9 @@ def register_treebuilders_from(module):
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
+class ParserRejectedMarkup(Exception):
+ pass
+
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py
index 6001e386..7de36ae7 100644
--- a/libs/bs4/builder/_html5lib.py
+++ b/libs/bs4/builder/_html5lib.py
@@ -27,7 +27,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
- return markup, None, None, False
+ yield (markup, None, None, False)
# These methods are defined by Beautiful Soup.
def feed(self, markup):
@@ -123,17 +123,50 @@ class Element(html5lib.treebuilders._base.Node):
self.namespace = namespace
def appendChild(self, node):
- if (node.element.__class__ == NavigableString and self.element.contents
+ string_child = child = None
+ if isinstance(node, basestring):
+ # Some other piece of code decided to pass in a string
+ # instead of creating a TextElement object to contain the
+ # string.
+ string_child = child = node
+ elif isinstance(node, Tag):
+ # Some other piece of code decided to pass in a Tag
+ # instead of creating an Element object to contain the
+ # Tag.
+ child = node
+ elif node.element.__class__ == NavigableString:
+ string_child = child = node.element
+ else:
+ child = node.element
+
+ if not isinstance(child, basestring) and child.parent is not None:
+ node.element.extract()
+
+ if (string_child and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
- # Concatenate new text onto old text node
- # XXX This has O(n^2) performance, for input like
+ # We are appending a string onto another string.
+ # TODO This has O(n^2) performance, for input like
# "aaa..."
old_element = self.element.contents[-1]
- new_element = self.soup.new_string(old_element + node.element)
+ new_element = self.soup.new_string(old_element + string_child)
old_element.replace_with(new_element)
+ self.soup._most_recent_element = new_element
else:
- self.element.append(node.element)
- node.parent = self
+ if isinstance(node, basestring):
+ # Create a brand new NavigableString from this string.
+ child = self.soup.new_string(node)
+
+ # Tell Beautiful Soup to act as if it parsed this element
+ # immediately after the parent's last descendant. (Or
+ # immediately after the parent, if it has no children.)
+ if self.element.contents:
+ most_recent_element = self.element._last_descendant(False)
+ else:
+ most_recent_element = self.element
+
+ self.soup.object_was_parsed(
+ child, parent=self.element,
+ most_recent_element=most_recent_element)
def getAttributes(self):
return AttrList(self.element)
@@ -162,11 +195,11 @@ class Element(html5lib.treebuilders._base.Node):
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
- text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
- self.insertBefore(text, insertBefore)
+ text = TextNode(self.soup.new_string(data), self.soup)
+ self.insertBefore(data, insertBefore)
else:
- self.appendChild(text)
+ self.appendChild(data)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
@@ -183,16 +216,46 @@ class Element(html5lib.treebuilders._base.Node):
def removeChild(self, node):
node.element.extract()
- def reparentChildren(self, newParent):
- while self.element.contents:
- child = self.element.contents[0]
- child.extract()
- if isinstance(child, Tag):
- newParent.appendChild(
- Element(child, self.soup, namespaces["html"]))
- else:
- newParent.appendChild(
- TextNode(child, self.soup))
+ def reparentChildren(self, new_parent):
+ """Move all of this tag's children into another tag."""
+ element = self.element
+ new_parent_element = new_parent.element
+ # Determine what this tag's next_element will be once all the children
+ # are removed.
+ final_next_element = element.next_sibling
+
+ new_parents_last_descendant = new_parent_element._last_descendant(False, False)
+ if len(new_parent_element.contents) > 0:
+ # The new parent already contains children. We will be
+ # appending this tag's children to the end.
+ new_parents_last_child = new_parent_element.contents[-1]
+ new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
+ else:
+ # The new parent contains no children.
+ new_parents_last_child = None
+ new_parents_last_descendant_next_element = new_parent_element.next_element
+
+ to_append = element.contents
+ append_after = new_parent.element.contents
+ if len(to_append) > 0:
+ # Set the first child's previous_element and previous_sibling
+ # to elements within the new parent
+ first_child = to_append[0]
+ first_child.previous_element = new_parents_last_descendant
+ first_child.previous_sibling = new_parents_last_child
+
+ # Fix the last child's next_element and next_sibling
+ last_child = to_append[-1]
+ last_child.next_element = new_parents_last_descendant_next_element
+ last_child.next_sibling = None
+
+ for child in to_append:
+ child.parent = new_parent_element
+ new_parent_element.contents.append(child)
+
+ # Now that this element has no children, change its .next_element.
+ element.contents = []
+ element.next_element = final_next_element
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py
index ede5cecb..ca8d8b89 100644
--- a/libs/bs4/builder/_htmlparser.py
+++ b/libs/bs4/builder/_htmlparser.py
@@ -45,7 +45,15 @@ HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
# XXX namespace
- self.soup.handle_starttag(name, None, None, dict(attrs))
+ attr_dict = {}
+ for key, value in attrs:
+ # Change None attribute values to the empty string
+ # for consistency with the other tree builders.
+ if value is None:
+ value = ''
+ attr_dict[key] = value
+ attrvalue = '""'
+ self.soup.handle_starttag(name, None, None, attr_dict)
def handle_endtag(self, name):
self.soup.handle_endtag(name)
@@ -58,6 +66,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
# it's fixed.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
+ elif name.startswith('X'):
+ real_name = int(name.lstrip('X'), 16)
else:
real_name = int(name)
@@ -85,6 +95,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
+ elif data == 'DOCTYPE':
+ # i.e. ""
+ data = ''
self.soup.handle_data(data)
self.soup.endData(Doctype)
@@ -130,13 +143,14 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
- return markup, None, None, False
+ yield (markup, None, None, False)
+ return
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding,
- dammit.contains_replacement_characters)
+ yield (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding,
+ dammit.contains_replacement_characters)
def feed(self, markup):
args, kwargs = self.parser_args
diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py
index c78fdff6..fa5d4987 100644
--- a/libs/bs4/builder/_lxml.py
+++ b/libs/bs4/builder/_lxml.py
@@ -3,6 +3,7 @@ __all__ = [
'LXMLTreeBuilder',
]
+from io import BytesIO
from StringIO import StringIO
import collections
from lxml import etree
@@ -12,9 +13,10 @@ from bs4.builder import (
HTML,
HTMLTreeBuilder,
PERMISSIVE,
+ ParserRejectedMarkup,
TreeBuilder,
XML)
-from bs4.dammit import UnicodeDammit
+from bs4.dammit import EncodingDetector
LXML = 'lxml'
@@ -28,24 +30,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
CHUNK_SIZE = 512
- @property
- def default_parser(self):
+ # This namespace mapping is specified in the XML Namespace
+ # standard.
+ DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+
+ def default_parser(self, encoding):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
- return etree.XMLParser(target=self, strip_cdata=False, recover=True)
+ if self._default_parser is not None:
+ return self._default_parser
+ return etree.XMLParser(
+ target=self, strip_cdata=False, recover=True, encoding=encoding)
+
+ def parser_for(self, encoding):
+ # Use the default parser.
+ parser = self.default_parser(encoding)
- def __init__(self, parser=None, empty_element_tags=None):
- if empty_element_tags is not None:
- self.empty_element_tags = set(empty_element_tags)
- if parser is None:
- # Use the default parser.
- parser = self.default_parser
if isinstance(parser, collections.Callable):
# Instantiate the parser with default arguments
- parser = parser(target=self, strip_cdata=False)
- self.parser = parser
+ parser = parser(target=self, strip_cdata=False, encoding=encoding)
+ return parser
+
+ def __init__(self, parser=None, empty_element_tags=None):
+ # TODO: Issue a warning if parser is present but not a
+ # callable, since that means there's no way to create new
+ # parsers for different encodings.
+ self._default_parser = parser
+ if empty_element_tags is not None:
+ self.empty_element_tags = set(empty_element_tags)
self.soup = None
- self.nsmaps = None
+ self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
@@ -58,50 +72,69 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
- :return: A 3-tuple (markup, original encoding, encoding
- declared within markup).
+ :yield: A series of 4-tuples.
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for parsing the document.
"""
if isinstance(markup, unicode):
- return markup, None, None, False
+ # We were given Unicode. Maybe lxml can parse Unicode on
+ # this system?
+ yield markup, None, document_declared_encoding, False
+ if isinstance(markup, unicode):
+ # No, apparently not. Convert the Unicode to UTF-8 and
+ # tell lxml to parse it as UTF-8.
+ yield (markup.encode("utf8"), "utf8",
+ document_declared_encoding, False)
+
+ # Instead of using UnicodeDammit to convert the bytestring to
+ # Unicode using different encodings, use EncodingDetector to
+ # iterate over the encodings, and tell lxml to try to parse
+ # the document as each one in turn.
+ is_html = not self.is_xml
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding,
- dammit.contains_replacement_characters)
+ detector = EncodingDetector(markup, try_encodings, is_html)
+ for encoding in detector.encodings:
+ yield (detector.markup, encoding, document_declared_encoding, False)
def feed(self, markup):
- if isinstance(markup, basestring):
+ if isinstance(markup, bytes):
+ markup = BytesIO(markup)
+ elif isinstance(markup, unicode):
markup = StringIO(markup)
+
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
- self.parser.feed(data)
- while data != '':
- # Now call feed() on the rest of the data, chunk by chunk.
- data = markup.read(self.CHUNK_SIZE)
- if data != '':
- self.parser.feed(data)
- self.parser.close()
+ try:
+ self.parser = self.parser_for(self.soup.original_encoding)
+ self.parser.feed(data)
+ while len(data) != 0:
+ # Now call feed() on the rest of the data, chunk by chunk.
+ data = markup.read(self.CHUNK_SIZE)
+ if len(data) != 0:
+ self.parser.feed(data)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ raise ParserRejectedMarkup(str(e))
def close(self):
- self.nsmaps = None
+ self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
-
nsprefix = None
# Invert each namespace map as it comes in.
- if len(nsmap) == 0 and self.nsmaps != None:
- # There are no new namespaces for this tag, but namespaces
- # are in play, so we need a separate tag stack to know
- # when they end.
+ if len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
- if self.nsmaps is None:
- self.nsmaps = []
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
@@ -111,14 +144,34 @@ class LXMLTreeBuilderForXML(TreeBuilder):
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
+
+ # Namespaces are in play. Find any attributes that came in
+ # from lxml with namespaces attached to their names, and
+ # turn then into NamespacedAttribute objects.
+ new_attrs = {}
+ for attr, value in attrs.items():
+ namespace, attr = self._getNsTag(attr)
+ if namespace is None:
+ new_attrs[attr] = value
+ else:
+ nsprefix = self._prefix_for_namespace(namespace)
+ attr = NamespacedAttribute(nsprefix, attr, namespace)
+ new_attrs[attr] = value
+ attrs = new_attrs
+
namespace, name = self._getNsTag(name)
- if namespace is not None:
- for inverted_nsmap in reversed(self.nsmaps):
- if inverted_nsmap is not None and namespace in inverted_nsmap:
- nsprefix = inverted_nsmap[namespace]
- break
+ nsprefix = self._prefix_for_namespace(namespace)
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
+ def _prefix_for_namespace(self, namespace):
+ """Find the currently active prefix for the given namespace."""
+ if namespace is None:
+ return None
+ for inverted_nsmap in reversed(self.nsmaps):
+ if inverted_nsmap is not None and namespace in inverted_nsmap:
+ return inverted_nsmap[namespace]
+ return None
+
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
@@ -130,14 +183,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
- if self.nsmaps != None:
+ if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
- if len(self.nsmaps) == 0:
- # Namespaces are no longer in play, so don't bother keeping
- # track of the namespace stack.
- self.nsmaps = None
def pi(self, target, data):
pass
@@ -166,13 +215,18 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE]
is_xml = False
- @property
- def default_parser(self):
+ def default_parser(self, encoding):
return etree.HTMLParser
def feed(self, markup):
- self.parser.feed(markup)
- self.parser.close()
+ encoding = self.soup.original_encoding
+ try:
+ self.parser = self.parser_for(encoding)
+ self.parser.feed(markup)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ raise ParserRejectedMarkup(str(e))
+
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py
index 58cad9ba..59640b7c 100644
--- a/libs/bs4/dammit.py
+++ b/libs/bs4/dammit.py
@@ -1,27 +1,40 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
-This class forces XML data into a standard format (usually to UTF-8 or
-Unicode). It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It does not rewrite the XML or HTML to reflect a new
-encoding; that's the tree builder's job.
+This library converts a bytestream to Unicode through any means
+necessary. It is heavily based on code from Mark Pilgrim's Universal
+Feed Parser. It works best on XML and XML, but it does not rewrite the
+XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
import codecs
from htmlentitydefs import codepoint2name
import re
-import warnings
+import logging
+import string
-# Autodetects character encodings. Very useful.
-# Download from http://chardet.feedparser.org/
-# or 'apt-get install python-chardet'
-# or 'easy_install chardet'
+# Import a library to autodetect character encodings.
+chardet_type = None
try:
- import chardet
- #import chardet.constants
- #chardet.constants._debug = 1
+ # First try the fast C implementation.
+ # PyPI package: cchardet
+ import cchardet
+ def chardet_dammit(s):
+ return cchardet.detect(s)['encoding']
except ImportError:
- chardet = None
+ try:
+ # Fall back to the pure Python implementation
+ # Debian package: python-chardet
+ # PyPI package: chardet
+ import chardet
+ def chardet_dammit(s):
+ return chardet.detect(s)['encoding']
+ #import chardet.constants
+ #chardet.constants._debug = 1
+ except ImportError:
+ # No chardet available.
+ def chardet_dammit(s):
+ return None
# Available from http://cjkpython.i18n.org/.
try:
@@ -69,6 +82,8 @@ class EntitySubstitution(object):
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
+ AMPERSAND_OR_BRACKET = re.compile("([<>&])")
+
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
@@ -122,6 +137,28 @@ class EntitySubstitution(object):
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
+ :param value: A string to be substituted. The less-than sign
+ will become <, the greater-than sign will become >,
+ and any ampersands will become &. If you want ampersands
+ that appear to be part of an entity definition to be left
+ alone, use substitute_xml_containing_entities() instead.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
+ # Escape angle brackets and ampersands.
+ value = cls.AMPERSAND_OR_BRACKET.sub(
+ cls._substitute_xml_entity, value)
+
+ if make_quoted_attribute:
+ value = cls.quoted_attribute_value(value)
+ return value
+
+ @classmethod
+ def substitute_xml_containing_entities(
+ cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+
:param value: A string to be substituted. The less-than sign will
become <, the greater-than sign will become >, and any
ampersands that are not part of an entity defition will
@@ -155,6 +192,125 @@ class EntitySubstitution(object):
cls._substitute_html_entity, s)
+class EncodingDetector:
+ """Suggests a number of possible encodings for a bytestring.
+
+ Order of precedence:
+
+ 1. Encodings you specifically tell EncodingDetector to try first
+ (the override_encodings argument to the constructor).
+
+ 2. An encoding declared within the bytestring itself, either in an
+ XML declaration (if the bytestring is to be interpreted as an XML
+ document), or in a tag (if the bytestring is to be
+ interpreted as an HTML document.)
+
+ 3. An encoding detected through textual analysis by chardet,
+ cchardet, or a similar external library.
+
+ 4. UTF-8.
+
+ 5. Windows-1252.
+ """
+ def __init__(self, markup, override_encodings=None, is_html=False):
+ self.override_encodings = override_encodings or []
+ self.chardet_encoding = None
+ self.is_html = is_html
+ self.declared_encoding = None
+
+ # First order of business: strip a byte-order mark.
+ self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
+
+ def _usable(self, encoding, tried):
+ if encoding is not None:
+ encoding = encoding.lower()
+ if encoding not in tried:
+ tried.add(encoding)
+ return True
+ return False
+
+ @property
+ def encodings(self):
+ """Yield a number of encodings that might work for this markup."""
+ tried = set()
+ for e in self.override_encodings:
+ if self._usable(e, tried):
+ yield e
+
+ # Did the document originally start with a byte-order mark
+ # that indicated its encoding?
+ if self._usable(self.sniffed_encoding, tried):
+ yield self.sniffed_encoding
+
+ # Look within the document for an XML or HTML encoding
+ # declaration.
+ if self.declared_encoding is None:
+ self.declared_encoding = self.find_declared_encoding(
+ self.markup, self.is_html)
+ if self._usable(self.declared_encoding, tried):
+ yield self.declared_encoding
+
+ # Use third-party character set detection to guess at the
+ # encoding.
+ if self.chardet_encoding is None:
+ self.chardet_encoding = chardet_dammit(self.markup)
+ if self._usable(self.chardet_encoding, tried):
+ yield self.chardet_encoding
+
+ # As a last-ditch effort, try utf-8 and windows-1252.
+ for e in ('utf-8', 'windows-1252'):
+ if self._usable(e, tried):
+ yield e
+
+ @classmethod
+ def strip_byte_order_mark(cls, data):
+ """If a byte-order mark is present, strip it and return the encoding it implies."""
+ encoding = None
+ if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16be'
+ data = data[2:]
+ elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16le'
+ data = data[2:]
+ elif data[:3] == b'\xef\xbb\xbf':
+ encoding = 'utf-8'
+ data = data[3:]
+ elif data[:4] == b'\x00\x00\xfe\xff':
+ encoding = 'utf-32be'
+ data = data[4:]
+ elif data[:4] == b'\xff\xfe\x00\x00':
+ encoding = 'utf-32le'
+ data = data[4:]
+ return data, encoding
+
+ @classmethod
+ def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
+ """Given a document, tries to find its declared encoding.
+
+ An XML encoding is declared at the beginning of the document.
+
+ An HTML encoding is declared in a tag, hopefully near the
+ beginning of the document.
+ """
+ if search_entire_document:
+ xml_endpos = html_endpos = len(markup)
+ else:
+ xml_endpos = 1024
+ html_endpos = max(2048, int(len(markup) * 0.05))
+
+ declared_encoding = None
+ declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+ if not declared_encoding_match and is_html:
+ declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+ if declared_encoding_match is not None:
+ declared_encoding = declared_encoding_match.groups()[0].decode(
+ 'ascii')
+ if declared_encoding:
+ return declared_encoding.lower()
+ return None
+
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
@@ -176,65 +332,48 @@ class UnicodeDammit:
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False):
- self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
+ self.is_html = is_html
- if markup == '' or isinstance(markup, unicode):
+ self.detector = EncodingDetector(markup, override_encodings, is_html)
+
+ # Short-circuit if the data is in Unicode to begin with.
+ if isinstance(markup, unicode) or markup == '':
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
- new_markup, document_encoding, sniffed_encoding = \
- self._detectEncoding(markup, is_html)
- self.markup = new_markup
+ # The encoding detector may have stripped a byte-order mark.
+ # Use the stripped markup from this point on.
+ self.markup = self.detector.markup
u = None
- if new_markup != markup:
- # _detectEncoding modified the markup, then converted it to
- # Unicode and then to UTF-8. So convert it from UTF-8.
- u = self._convert_from("utf8")
- self.original_encoding = sniffed_encoding
+ for encoding in self.detector.encodings:
+ markup = self.detector.markup
+ u = self._convert_from(encoding)
+ if u is not None:
+ break
if not u:
- for proposed_encoding in (
- override_encodings + [document_encoding, sniffed_encoding]):
- if proposed_encoding is not None:
- u = self._convert_from(proposed_encoding)
- if u:
- break
+ # None of the encodings worked. As an absolute last resort,
+ # try them again with character replacement.
- # If no luck and we have auto-detection library, try that:
- if not u and chardet and not isinstance(self.markup, unicode):
- u = self._convert_from(chardet.detect(self.markup)['encoding'])
-
- # As a last resort, try utf-8 and windows-1252:
- if not u:
- for proposed_encoding in ("utf-8", "windows-1252"):
- u = self._convert_from(proposed_encoding)
- if u:
- break
-
- # As an absolute last resort, try the encodings again with
- # character replacement.
- if not u:
- for proposed_encoding in (
- override_encodings + [
- document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
- if proposed_encoding != "ascii":
- u = self._convert_from(proposed_encoding, "replace")
+ for encoding in self.detector.encodings:
+ if encoding != "ascii":
+ u = self._convert_from(encoding, "replace")
if u is not None:
- warnings.warn(
- UnicodeWarning(
+ logging.warning(
"Some characters could not be decoded, and were "
- "replaced with REPLACEMENT CHARACTER."))
+ "replaced with REPLACEMENT CHARACTER.")
self.contains_replacement_characters = True
break
- # We could at this point force it to ASCII, but that would
- # destroy so much data that I think giving up is better
+ # If none of that worked, we could at this point force it to
+ # ASCII, but that would destroy so much data that I think
+ # giving up is better.
self.unicode_markup = u
if not u:
self.original_encoding = None
@@ -262,11 +401,10 @@ class UnicodeDammit:
return None
self.tried_encodings.append((proposed, errors))
markup = self.markup
-
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if (self.smart_quotes_to is not None
- and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
+ and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
smart_quotes_re = b"([\x80-\x9f])"
smart_quotes_compiled = re.compile(smart_quotes_re)
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
@@ -287,99 +425,24 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
+ return unicode(data, encoding, errors)
- # strip Byte Order Mark (if present)
- if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
- and (data[2:4] != '\x00\x00'):
- encoding = 'utf-16be'
- data = data[2:]
- elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
- and (data[2:4] != '\x00\x00'):
- encoding = 'utf-16le'
- data = data[2:]
- elif data[:3] == '\xef\xbb\xbf':
- encoding = 'utf-8'
- data = data[3:]
- elif data[:4] == '\x00\x00\xfe\xff':
- encoding = 'utf-32be'
- data = data[4:]
- elif data[:4] == '\xff\xfe\x00\x00':
- encoding = 'utf-32le'
- data = data[4:]
- newdata = unicode(data, encoding, errors)
- return newdata
-
- def _detectEncoding(self, xml_data, is_html=False):
- """Given a document, tries to detect its XML encoding."""
- xml_encoding = sniffed_xml_encoding = None
- try:
- if xml_data[:4] == b'\x4c\x6f\xa7\x94':
- # EBCDIC
- xml_data = self._ebcdic_to_ascii(xml_data)
- elif xml_data[:4] == b'\x00\x3c\x00\x3f':
- # UTF-16BE
- sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
- and (xml_data[2:4] != b'\x00\x00'):
- # UTF-16BE with BOM
- sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
- elif xml_data[:4] == b'\x3c\x00\x3f\x00':
- # UTF-16LE
- sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
- (xml_data[2:4] != b'\x00\x00'):
- # UTF-16LE with BOM
- sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
- elif xml_data[:4] == b'\x00\x00\x00\x3c':
- # UTF-32BE
- sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
- elif xml_data[:4] == b'\x3c\x00\x00\x00':
- # UTF-32LE
- sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
- elif xml_data[:4] == b'\x00\x00\xfe\xff':
- # UTF-32BE with BOM
- sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
- elif xml_data[:4] == b'\xff\xfe\x00\x00':
- # UTF-32LE with BOM
- sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
- elif xml_data[:3] == b'\xef\xbb\xbf':
- # UTF-8 with BOM
- sniffed_xml_encoding = 'utf-8'
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
- else:
- sniffed_xml_encoding = 'ascii'
- pass
- except:
- xml_encoding_match = None
- xml_encoding_match = xml_encoding_re.match(xml_data)
- if not xml_encoding_match and is_html:
- xml_encoding_match = html_meta_re.search(xml_data)
- if xml_encoding_match is not None:
- xml_encoding = xml_encoding_match.groups()[0].decode(
- 'ascii').lower()
- if is_html:
- self.declared_html_encoding = xml_encoding
- if sniffed_xml_encoding and \
- (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
- 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
- 'utf-16', 'utf-32', 'utf_16', 'utf_32',
- 'utf16', 'u16')):
- xml_encoding = sniffed_xml_encoding
- return xml_data, xml_encoding, sniffed_xml_encoding
+ @property
+ def declared_html_encoding(self):
+ if not self.is_html:
+ return None
+ return self.detector.declared_encoding
def find_codec(self, charset):
- return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
- or (charset and self._codec(charset.replace("-", ""))) \
- or (charset and self._codec(charset.replace("-", "_"))) \
+ value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
+ or (charset and self._codec(charset.replace("-", "")))
+ or (charset and self._codec(charset.replace("-", "_")))
+ or (charset and charset.lower())
or charset
+ )
+ if value:
+ return value.lower()
+ return None
def _codec(self, charset):
if not charset:
@@ -392,32 +455,6 @@ class UnicodeDammit:
pass
return codec
- EBCDIC_TO_ASCII_MAP = None
-
- def _ebcdic_to_ascii(self, s):
- c = self.__class__
- if not c.EBCDIC_TO_ASCII_MAP:
- emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
- 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
- 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
- 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
- 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
- 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
- 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
- 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
- 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
- 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
- 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
- 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
- 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
- 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
- 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
- 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
- 250,251,252,253,254,255)
- import string
- c.EBCDIC_TO_ASCII_MAP = string.maketrans(
- ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
- return s.translate(c.EBCDIC_TO_ASCII_MAP)
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
MS_CHARS = {b'\x80': ('euro', '20AC'),
diff --git a/libs/bs4/diagnose.py b/libs/bs4/diagnose.py
new file mode 100644
index 00000000..4d0b00af
--- /dev/null
+++ b/libs/bs4/diagnose.py
@@ -0,0 +1,204 @@
+"""Diagnostic functions, mainly for use when doing tech support."""
+import cProfile
+from StringIO import StringIO
+from HTMLParser import HTMLParser
+import bs4
+from bs4 import BeautifulSoup, __version__
+from bs4.builder import builder_registry
+
+import os
+import pstats
+import random
+import tempfile
+import time
+import traceback
+import sys
+import cProfile
+
+def diagnose(data):
+ """Diagnostic suite for isolating common problems."""
+ print "Diagnostic running on Beautiful Soup %s" % __version__
+ print "Python version %s" % sys.version
+
+ basic_parsers = ["html.parser", "html5lib", "lxml"]
+ for name in basic_parsers:
+ for builder in builder_registry.builders:
+ if name in builder.features:
+ break
+ else:
+ basic_parsers.remove(name)
+ print (
+ "I noticed that %s is not installed. Installing it may help." %
+ name)
+
+ if 'lxml' in basic_parsers:
+ basic_parsers.append(["lxml", "xml"])
+ from lxml import etree
+ print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+
+ if 'html5lib' in basic_parsers:
+ import html5lib
+ print "Found html5lib version %s" % html5lib.__version__
+
+ if hasattr(data, 'read'):
+ data = data.read()
+ elif os.path.exists(data):
+ print '"%s" looks like a filename. Reading data from the file.' % data
+ data = open(data).read()
+ elif data.startswith("http:") or data.startswith("https:"):
+ print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
+ print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+ return
+ print
+
+ for parser in basic_parsers:
+ print "Trying to parse your markup with %s" % parser
+ success = False
+ try:
+ soup = BeautifulSoup(data, parser)
+ success = True
+ except Exception, e:
+ print "%s could not parse the markup." % parser
+ traceback.print_exc()
+ if success:
+ print "Here's what %s did with the markup:" % parser
+ print soup.prettify()
+
+ print "-" * 80
+
+def lxml_trace(data, html=True, **kwargs):
+ """Print out the lxml events that occur during parsing.
+
+ This lets you see how lxml parses a document when no Beautiful
+ Soup code is running.
+ """
+ from lxml import etree
+ for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
+ print("%s, %4s, %s" % (event, element.tag, element.text))
+
+class AnnouncingParser(HTMLParser):
+ """Announces HTMLParser parse events, without doing anything else."""
+
+ def _p(self, s):
+ print(s)
+
+ def handle_starttag(self, name, attrs):
+ self._p("%s START" % name)
+
+ def handle_endtag(self, name):
+ self._p("%s END" % name)
+
+ def handle_data(self, data):
+ self._p("%s DATA" % data)
+
+ def handle_charref(self, name):
+ self._p("%s CHARREF" % name)
+
+ def handle_entityref(self, name):
+ self._p("%s ENTITYREF" % name)
+
+ def handle_comment(self, data):
+ self._p("%s COMMENT" % data)
+
+ def handle_decl(self, data):
+ self._p("%s DECL" % data)
+
+ def unknown_decl(self, data):
+ self._p("%s UNKNOWN-DECL" % data)
+
+ def handle_pi(self, data):
+ self._p("%s PI" % data)
+
+def htmlparser_trace(data):
+ """Print out the HTMLParser events that occur during parsing.
+
+ This lets you see how HTMLParser parses a document when no
+ Beautiful Soup code is running.
+ """
+ parser = AnnouncingParser()
+ parser.feed(data)
+
+_vowels = "aeiou"
+_consonants = "bcdfghjklmnpqrstvwxyz"
+
+def rword(length=5):
+ "Generate a random word-like string."
+ s = ''
+ for i in range(length):
+ if i % 2 == 0:
+ t = _consonants
+ else:
+ t = _vowels
+ s += random.choice(t)
+ return s
+
+def rsentence(length=4):
+ "Generate a random sentence-like string."
+ return " ".join(rword(random.randint(4,9)) for i in range(length))
+
+def rdoc(num_elements=1000):
+ """Randomly generate an invalid HTML document."""
+ tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
+ elements = []
+ for i in range(num_elements):
+ choice = random.randint(0,3)
+ if choice == 0:
+ # New tag.
+ tag_name = random.choice(tag_names)
+ elements.append("<%s>" % tag_name)
+ elif choice == 1:
+ elements.append(rsentence(random.randint(1,4)))
+ elif choice == 2:
+ # Close a tag.
+ tag_name = random.choice(tag_names)
+ elements.append("%s>" % tag_name)
+ return "" + "\n".join(elements) + ""
+
+def benchmark_parsers(num_elements=100000):
+ """Very basic head-to-head performance benchmark."""
+ print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+ data = rdoc(num_elements)
+ print "Generated a large invalid HTML document (%d bytes)." % len(data)
+
+ for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
+ success = False
+ try:
+ a = time.time()
+ soup = BeautifulSoup(data, parser)
+ b = time.time()
+ success = True
+ except Exception, e:
+ print "%s could not parse the markup." % parser
+ traceback.print_exc()
+ if success:
+ print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+
+ from lxml import etree
+ a = time.time()
+ etree.HTML(data)
+ b = time.time()
+ print "Raw lxml parsed the markup in %.2fs." % (b-a)
+
+ import html5lib
+ parser = html5lib.HTMLParser()
+ a = time.time()
+ parser.parse(data)
+ b = time.time()
+ print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+
+def profile(num_elements=100000, parser="lxml"):
+
+ filehandle = tempfile.NamedTemporaryFile()
+ filename = filehandle.name
+
+ data = rdoc(num_elements)
+ vars = dict(bs4=bs4, data=data, parser=parser)
+ cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
+
+ stats = pstats.Stats(filename)
+ # stats.strip_dirs()
+ stats.sort_stats("cumulative")
+ stats.print_stats('_html5lib|bs4', 50)
+
+if __name__ == '__main__':
+ diagnose(sys.stdin.read())
diff --git a/libs/bs4/element.py b/libs/bs4/element.py
index 91a40078..da9afdf4 100644
--- a/libs/bs4/element.py
+++ b/libs/bs4/element.py
@@ -26,6 +26,9 @@ class NamespacedAttribute(unicode):
def __new__(cls, prefix, name, namespace=None):
if name is None:
obj = unicode.__new__(cls, prefix)
+ elif prefix is None:
+ # Not really namespaced.
+ obj = unicode.__new__(cls, name)
else:
obj = unicode.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
@@ -78,6 +81,40 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
+class HTMLAwareEntitySubstitution(EntitySubstitution):
+
+ """Entity substitution rules that are aware of some HTML quirks.
+
+ Specifically, the contents of
+"""
+ soup = BeautifulSoup(doc, "xml")
+ # lxml would have stripped this while parsing, but we can add
+ # it later.
+ soup.script.string = 'console.log("< < hey > > ");'
+ encoded = soup.encode()
+ self.assertTrue(b"< < hey > >" in encoded)
+
+ def test_can_parse_unicode_document(self):
+ markup = u'